In [1]:
# Create a PDF archive of URLs taken from '.url' internet-shortcut files.
# 2018/12/14

# Path to the Chromium executable. When non-empty, fetch_url() copies it into
# the 'executablePath' entry of its browser options; leave it empty to skip
# that entry.
chromiumbrowser_path = ''
# Example values (uncomment one to use it):
#chromiumbrowser_path = "/usr/lib/chromium-browser/chromium-browser"
#chromiumbrowser_path = "/usr/lib/chromium/chromium"

import logging, os, tempfile, asyncio, urllib.parse
import requests, dns.resolver, validators, pyppeteer


def setup_pydrive (client_id = ''): # uses stored secrets, otherwise uses id
    """Authenticate with pydrive and return a GoogleDrive handle.

    When 'client_secrets.json' is missing from the current working directory,
    the text in ``client_id`` is written to that file for the duration of the
    authentication and the file is removed again afterwards -- including when
    authentication fails, so the secret is never left behind on disk.

    Args:
        client_id: Text to write into a temporary 'client_secrets.json'. Only
            used when that file does not already exist.

    Returns:
        The object returned by pydrive.drive.GoogleDrive(gauth).

    Raises:
        Exception: if 'client_secrets.json' does not exist and ``client_id``
            is empty. Errors from pydrive propagate unchanged.
    """
    secrets_path = 'client_secrets.json'
    created_secrets = False
    try:
        # check if secret file exists already, if not create and delete when done
        if not os.path.exists(secrets_path):
            if client_id == '':
                raise Exception('Need client_secrets.json in cwd or client_id param')
            # Flag first so a partially written file is cleaned up as well.
            created_secrets = True
            with open(secrets_path, 'w') as client_secrets:
                client_secrets.write(client_id)
        import pydrive.auth
        import pydrive.drive
        gauth = pydrive.auth.GoogleAuth()
        gauth.CommandLineAuth() # provides link, paste code
    finally:
        # Only remove the file this call created; a pre-existing one is kept.
        if created_secrets and os.path.exists(secrets_path):
            os.remove(secrets_path)
    return pydrive.drive.GoogleDrive(gauth)


def move_file (file, dest_id, token): # update file replacing parent id
    """Replace the parents of ``file`` with ``dest_id`` and upload it.

    file['parents'] is overwritten with a single 'drive#parentReference'
    entry whose links are built from ``dest_id`` and ``token``, then
    file.Upload() is called. Any exception raised by the upload is re-raised
    wrapped in a plain Exception.
    """
    parent_ref = {}
    parent_ref['id'] = dest_id
    parent_ref['isRoot'] = False
    parent_ref['kind'] = 'drive#parentReference'
    parent_ref['parentLink'] = 'https://www.googleapis.com/drive/v2/files/' + dest_id
    parent_ref['selfLink'] = (
        f'https://www.googleapis.com/drive/v2/files/{token}/parents/' + dest_id)
    file['parents'] = [parent_ref]
    try:
        file.Upload()
    except Exception as upload_error:
        raise Exception(upload_error)


def get_file (drive, folder_id): # get files from folder on Google pydrive
    """Yield every entry returned by drive.ListFile for ``folder_id``.

    The listing is requested with maxResults=1 and a query selecting
    non-trashed, non-folder items whose parents include ``folder_id``. Each
    item of each list produced by the listing is yielded in order.
    """
    query = "('" + folder_id + "' in parents) and (mimeType != 'application/vnd.google-apps.folder') and trashed=false"
    listing_args = {'maxResults': 1, 'q': query}
    # Hand ListFile its own copy of the arguments.
    for page in drive.ListFile(dict(listing_args)):
        yield from page


def extract_url (url_file): # extract URL from '.url' internet shortcut file
    """Return the target address stored in a '.url' internet shortcut.

    The content returned by url_file.GetContentString() is scanned for the
    first line beginning with 'URL=' (case-insensitive, surrounding whitespace
    ignored) and the text after the '=' is returned. Shortcut files may carry
    other sections or keys ahead of the URL entry, so the entry is located by
    name rather than by position.

    Args:
        url_file: object exposing GetContentString() returning the file text.

    Returns:
        The URL string.

    Raises:
        IndexError: if no 'URL=' line exists and the content has fewer than
            two lines (legacy positional fallback).
    """
    lines = url_file.GetContentString().splitlines()
    for line in lines:
        entry = line.strip()
        if entry[:4].upper() == 'URL=':
            return entry[4:].strip()
    # Legacy behaviour kept for callers: take the second line and drop its
    # first four characters when no explicit 'URL=' entry was found.
    return lines[1][4:]


def check_url_dns (url):
    """Return True when the host of ``url`` can be looked up via DNS.

    The lookup uses a dns.resolver.Resolver pointed at the nameservers
    8.8.8.8 and 8.8.4.4. Only the host name is queried: the port and any
    user-info that urlsplit() keeps in ``netloc`` are not valid in a DNS
    name and would make every such URL fail.

    Args:
        url: URL string whose host should be resolved.

    Returns:
        True if the query succeeds; False if the URL has no host, cannot be
        parsed, or the query raises.
    """
    url_resolver = dns.resolver.Resolver()
    url_resolver.nameservers = ['8.8.8.8', '8.8.4.4']
    try:
        host = urllib.parse.urlsplit(url).hostname
        if not host:
            return False
        url_resolver.query(host)
    except Exception:
        # Any parse or resolution failure means "not resolvable"; unlike a
        # bare except this lets KeyboardInterrupt/SystemExit through.
        return False
    return True


def check_url (url, timeout = 30):
    """Return True when ``url`` is well-formed, resolvable and answers HEAD.

    Checks are run in order and the first failure returns False:
      1. validators.url(url)
      2. check_url_dns(url)
      3. requests.head(url, allow_redirects=True) followed by
         raise_for_status()

    Args:
        url: URL string to test.
        timeout: seconds passed to requests.head so an unresponsive server
            cannot block the caller indefinitely.

    Returns:
        True if every check passes, otherwise False. Request errors are
        logged, not raised.
    """
    mLog.debug('validating url: {}'.format(url))
    if not validators.url(url):
        mLog.info('url did not validate')
        return False
    mLog.debug('checking dns')
    if not check_url_dns(url):
        mLog.info('unable to resolve dns')
        return False
    mLog.debug('requesting http head')
    try:
        r = requests.head(url, allow_redirects=True, timeout=timeout)
        # Raises for 4xx/5xx, so a response that gets past this line is ok.
        r.raise_for_status()
    except Exception:
        mLog.info('http head request failed', exc_info=True)
        return False
    mLog.info('no connection error')
    return True


def guess_protocol (url):
    """Return the first scheme prefix for which check_url accepts the URL.

    'https:' is tried before 'http:'; each candidate is prepended to ``url``
    and handed to check_url. An empty string is returned when neither passes.
    """
    for candidate in ('https:', 'http:'):
        mLog.debug('trying protocol: {}'.format(candidate))
        if not check_url(candidate + url):
            continue
        return candidate
    mLog.info('no schemes passed check_url')
    return ''


async def fetch_url (url, pdf_out, html_out):
    """Load ``url`` in a pyppeteer browser and write its HTML and PDF.

    Args:
        url: address passed to page.goto().
        pdf_out: binary file object that receives the bytes from page.pdf()
            and is rewound to offset 0, or None to skip the PDF.
        html_out: binary file object that receives page.content() encoded as
            UTF-8 and is rewound to offset 0, or None to skip the HTML.

    Returns:
        True when the page was fetched and the requested outputs written.

    Raises:
        Whatever exception the browser or the output objects raise; it is
        logged and re-raised unchanged. The browser is closed in every case.
    """
    mLog.info('fetching url')
    options = {
        'args': [
            '--no-sandbox',
            '--disable-setuid-sandbox'],
        'ignoreHTTPSErrors': True,
        'headless': True,
    }
    if chromiumbrowser_path:
        options['executablePath'] = chromiumbrowser_path
        mLog.debug('chromium browser executable path set: {}'.format(chromiumbrowser_path))
    browser = None
    try:
        # The options must actually reach launch(); otherwise the sandbox
        # flags, HTTPS setting and executable path above have no effect.
        browser = await pyppeteer.launch(options)
        page = await browser.newPage()
        mLog.info('getting page')
        await page.goto(url)
        mLog.debug('getting html')
        if html_out is not None:
            html = await page.content()
            html_out.write(html.encode('utf-8'))
            html_out.seek(0)
        mLog.debug('getting pdf')
        if pdf_out is not None:
            pdf = await page.pdf()
            pdf_out.write(pdf)
            pdf_out.seek(0)
        return True
    except Exception as error:
        mLog.error(error, exc_info=True)
        raise
    finally:
        # Close the browser on success and on failure so no Chromium process
        # is left running; a failing close must not mask the real error.
        if browser is not None:
            mLog.debug('closing browser')
            try:
                await browser.close()
            except Exception:
                mLog.warning('failed to close browser', exc_info=True)


async def get_result (url, drive, name, parent_id):
    """Fetch ``url`` and upload its PDF and HTML renderings through ``drive``.

    fetch_url() writes both renderings into temporary files; each is attached
    to a file created with drive.CreateFile() (titled ``name`` + '.pdf' /
    '.html') and handed to move_file() with vars_input['outs_id'] and
    vars_input['token']. The temporary files are closed and deleted before
    returning, whether or not the upload succeeded.

    Args:
        url: address to archive.
        drive: object providing CreateFile(metadata).
        name: base title for the two uploaded files.
        parent_id: currently unused -- the destination comes from
            vars_input['outs_id']. NOTE(review): confirm whether this was
            meant to be the upload destination.

    Returns:
        True on success, False if any step raised (the error is logged).
    """
    pdf_tmp = html_tmp = None
    try:
        pdf_tmp = tempfile.NamedTemporaryFile(delete=False)
        html_tmp = tempfile.NamedTemporaryFile(delete=False)
        await fetch_url(url, pdf_tmp, html_tmp)
        for tmp, ext in ((pdf_tmp, 'pdf'), (html_tmp, 'html')):
            mLog.debug('saving ' + ext)
            out = drive.CreateFile({'title': name + '.' + ext})
            out.SetContentFile(tmp.name)
            move_file(out, vars_input['outs_id'], vars_input['token']) # also Uploads
        return True
    except Exception:
        mLog.info('error in get_result', exc_info=True)
        return False
    finally:
        # delete=False means nothing removes these files for us; clean up
        # here so repeated runs do not fill the temp directory.
        for tmp in (pdf_tmp, html_tmp):
            if tmp is None:
                continue
            try:
                tmp.close()
                os.remove(tmp.name)
            except OSError:
                # Best effort: e.g. the file may still be held open elsewhere.
                mLog.debug('could not remove temp file: {}'.format(tmp.name), exc_info=True)


In [2]:
    # Run configuration, left blank here; fill in locally and do not commit
    # real ids or secrets. get_result() reads 'outs_id' and 'token' and passes
    # them to move_file() as the destination id and the id used in the parent
    # 'selfLink'. The remaining keys are presumably Drive folder ids for the
    # source / processing / failed / done stages -- TODO confirm against the
    # cells that use them.
    vars_input = {
        'src_id': '',
        'proc_id': '',
        'fail_id': '',
        'done_id': '',
        'outs_id': '',
        'token': ''
    }
    # Passed to setup_pydrive(), which writes it to a temporary
    # 'client_secrets.json' when that file is not already present.
    client_id = ''

    # NOTE(review): 'src' and 'prc' are not referenced by the code visible
    # here; presumably consumed by later cells -- verify.
    src = ''
    prc = vars_input['proc_id']

In [None]:
# Authenticate and keep the resulting drive handle for the cells that follow.
# setup_pydrive() uses command-line auth (prints a link, expects a pasted code).
drive = setup_pydrive(client_id)