## Recursive GoIndex Downloader by atlonxp

**Features**
*   Recursive crawler (**atlonxp**)
*   Download all folders and files in a given url (**atlonxp**)
*   Download all folders and files in sub-folders (**atlonxp**)
*   Adaptive delay in fetching url (**atlonxp**)
*   Store folders/files directly to your Google Drive (**pankaj260**)
*   Folders and files exclusion filters (**atlonxp**)
*   Download queue supported (**atlonxp**)
*   Auto-domain URL detection (**atlonxp**)
*   API-based GoIndex crawler (**atlonxp**, **ifvv**)
*   Parallel/Multiple files downloader (**atlonxp**)
*   Auto-skip password-protected folders (**cxu-fork**)

**Version 2** - API-based crawler with parallel file downloader

	21 April 2020 (v2.3.1)
	---------------------
	While crawling, a fetch may occasionally fail because requests are sent too quickly or the
	server is busy, which causes an error when parsing the JSON response. In that case we re-fetch
	the url (up to MAX_RETRY_CRAWLING times) until the key "files" appears in the response. If the
	retry limit is reached and the key "files" is still not found, we skip this link (nothing is
	collected from it).

	At the end, if you find there are failures, just re-run the download section again. If you
	set OVERWRITE = True, all files will be re-downloaded; otherwise files that already exist
	are skipped.

	+ added MAX_RETRY_CRAWLING (v2.3)
	+ fixed FILE_EXISTING_CHECK (stupid) bug
	+ added failure-links download task

	20 April 2020 (v2.2)
	---------------------
	Some sub-folders may be password-protected, which causes an error while crawling, so we
	skip these folders

	+ added auto-skip password-protected folder

	17 April 2020 (v2.1)
    ---------------------
	+ fixed URL duplicated when crawling
	+ added search key 'files' function

	16 April 2020 (v2.0)
    ---------------------
	+ crawler_v2:
		* API-based GoIndex crawler
		* Collecting all urls to be downloaded
	+ parallel downloader
		* tqdm progress bar

In [0]:
# Mount Google Drive at /content/drive so that downloads can be stored there.
# Skip this cell if you don't want to save to your Google Drive.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Install dependencies
!pip install requests tqdm

In [0]:
# Import dependencies

import json
from json import JSONDecodeError

import multiprocessing
import os
from pathlib import Path
from random import randint
from time import sleep
from urllib import parse

import requests
import tqdm

In [0]:
# Progress-display flag; it is not referenced by any code visible in this notebook.
SHOW_DOWNLOAD_PROGRESS = False
# When True, download_agent fetches a file even if it already exists locally.
OVERWRITE = True

# Lower/upper bounds (in seconds) of the random pause taken before each request.
MIN_DELAY = 3
MAX_DELAY = 5
# Maximum number of attempts the crawler makes to fetch a listing containing "files".
MAX_RETRY_CRAWLING = 5

def check_exclusion(name, exclusions):
    """Return True when ``name`` contains at least one of the ``exclusions`` substrings."""
    return any(pattern in name for pattern in exclusions)


def find(key, dictionary):
    """Yield every value stored under ``key`` anywhere inside a nested structure.

    ``dictionary`` is normally a dict decoded from JSON, but any mix of dicts
    and lists is accepted; scalars (str, int, None, ...) are silently ignored
    instead of raising, so lists holding non-dict items are safe to search.

    A value found under ``key`` is yielded as-is and is not searched further.
    """
    if isinstance(dictionary, dict):
        for k, v in dictionary.items():
            if k == key:
                yield v
            elif isinstance(v, (dict, list)):
                yield from find(key, v)
    elif isinstance(dictionary, list):
        for item in dictionary:
            yield from find(key, item)


def _fetch_listing(url):
    """Fetch and decode the JSON listing served at ``url``.

    The url is POSTed up to MAX_RETRY_CRAWLING times until the response text
    contains "files". If the text cannot be decoded as JSON, a single fallback
    attempt is made with the ``curl`` command-line tool.

    Returns the decoded JSON value, or None when no listing could be obtained
    (the failure is reported on stdout).
    """
    try:
        response_text = ''
        retry = 0
        while 'files' not in response_text:
            retry += 1
            if retry > MAX_RETRY_CRAWLING:
                break
            if retry > 1:
                print('retry #{}'.format(retry), url)
                sleep(randint(MIN_DELAY, MAX_DELAY))
            response_text = requests.post(url, data={}).text
        return json.loads(response_text)
    except JSONDecodeError:
        sleep(randint(MIN_DELAY, MAX_DELAY))
        print('- Data is missing! change a plan -')
        print('- > use terminal CURL            -')
        import subprocess
        try:
            # Argument list (no shell) so characters in the url can never be
            # interpreted as shell syntax.
            completed = subprocess.run(
                ['curl', url, '-d', ''],
                stdout=subprocess.PIPE,
                universal_newlines=True,
            )
            return json.loads(completed.stdout)
        except Exception as e:
            print('Nah, something went wrong!')
            print(e.args)
            return None
    except Exception as e:
        print('Nah, something went wrong!')
        print(e.args)
        return None


def crawler_v2(url, downloading_dict, path, level, exclusions, verbose=False):
    """Recursively collect download tasks for every file below a GoIndex folder.

    Args:
        url: Folder url to crawl; sub-folder and file urls are built by
            appending the url-quoted entry name to it.
        downloading_dict: List of task dicts collected so far. It is extended
            in place and also returned.
        path: Local directory that corresponds to ``url``.
        level: Recursion depth; only passed on (incremented) to sub-folders.
        exclusions: Substrings; entries whose name contains one are skipped.
        verbose: When True, print the name of every collected file.

    Returns:
        ``downloading_dict`` with one dict per discovered file (keys 'folder',
        'filename', 'filename_abs', 'size', 'url'). A folder that cannot be
        listed is skipped without discarding tasks that were already collected.
    """
    # let slow down a bit
    sleep(randint(MIN_DELAY, MAX_DELAY))

    base_url = parse.urlparse(url).geturl()
    print(base_url)

    response_json = _fetch_listing(base_url)
    if response_json is None:
        # Failure was already reported; keep what has been collected so far.
        return downloading_dict

    if not isinstance(response_json, dict) or 'error' in response_json:
        # e.g. password-protected folders answer with an "error" payload
        print('Skip: ', response_json)
        return downloading_dict

    listings = list(find('files', response_json))
    if not listings:
        # Retries were exhausted without ever receiving a "files" entry.
        print('Skip: ', response_json)
        return downloading_dict

    for file in listings[0]:
        name = file['name']

        # if @name contains exclusion word, we ignore
        if check_exclusion(name, exclusions):
            continue

        if 'folder' in file['mimeType']:
            next_url = base_url + parse.quote(name) + "/"
            next_path = os.path.join(path, name)
            downloading_dict = crawler_v2(next_url, downloading_dict, next_path, level + 1, exclusions, verbose)
        else:
            if verbose:
                print('  ' + name)
            downloading_dict.append({
                'folder': path,
                'filename': name,
                'filename_abs': os.path.join(path, name),
                'size': file['size'],
                'url': base_url + parse.quote(name),
            })

    return downloading_dict


def download_agent(task, OVERWRITE=OVERWRITE):
    """Download the single file described by ``task``.

    Args:
        task: Dict with the keys 'folder' (local directory), 'filename_abs'
            (local target path) and 'url' (remote file), or None.
        OVERWRITE: When True the file is downloaded even if the target path
            already exists; otherwise existing files are left untouched.

    Returns:
        None on success (or when there is nothing to do), and ``task`` itself
        when the download failed so that the caller can retry it.
    """
    if task is None:
        return None

    # Making multiple requests too quickly can get you banned, so wait a random
    # delay between MIN_DELAY and MAX_DELAY seconds first.
    sleep(randint(MIN_DELAY, MAX_DELAY))

    folder = task['folder']
    filename_abs = task['filename_abs']
    url = task['url']

    Path(folder).mkdir(parents=True, exist_ok=True)
    if os.path.exists(filename_abs) and not OVERWRITE:
        return None

    # Download into a side file and move it into place only once complete, so
    # an interrupted transfer never looks like a finished download.
    partial = filename_abs + '.part'
    try:
        with requests.get(url, stream=True) as r:
            if r.status_code != 200:
                return task
            # 'wb' (not append) so an overwrite really replaces the old content.
            with open(partial, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
        os.replace(partial, filename_abs)
    except requests.RequestException:
        # Network problem: report the task as failed instead of crashing the pool.
        return task
    finally:
        # No-op after a successful move; removes leftovers after a failure.
        if os.path.exists(partial):
            os.remove(partial)
    return None

In [0]:
# Number of parallel download workers and name fragments to leave out.
MAX_DOWNLOAD_TASKS = 16
exclusions = ['__MACOSX/']

# Root directory that receives one sub-folder per entry of download_tasks.
destination = "/content/drive/My Drive/Knowledge/_Trainings/_download"
download_tasks = [
    dict(
        folder='FrontEndMasters - Complete Intro to Containers',
        url='https://tutnetflix.mlwdl.workers.dev/FrontEndMasters%20-%20Complete%20Intro%20to%20Containers/',
    ),
    dict(
        folder='test',
        url='https://gdrv.icu/0:/Star/宮崎あや/2013-2016/',
    ),
]

print(
    '##################################',
    '# Crawling all downloadable urls #',
    '##################################',
    sep='\n',
    end='\n\n',
)

# Crawl every configured url and gather all file tasks into one flat list.
tasks = []
for task in download_tasks:
    tasks.extend(
        crawler_v2(
            task['url'],
            [],
            os.path.join(destination, task['folder']),
            0,
            exclusions,
            verbose=False,
        )
    )

print('\nCollecting', len(tasks), 'is completed', end='\n\n')



In [0]:
print('##################################')
print('# Downloading files and folders  #')
print('##################################', end='\n\n')


def _download_batch(pool, batch):
    """Run download_agent over ``batch`` in ``pool`` and return the tasks that failed."""
    failed = []
    with tqdm.tqdm(total=len(batch)) as pbar:
        # imap yields results in submission order, so each result can be paired
        # with its own task; the workers still download in parallel.
        for task, result in zip(batch, pool.imap(download_agent, batch)):
            pbar.set_description('Downloading %s' % task.get('filename'))
            if result is not None:
                failed.append(result)
            pbar.update()
    return failed


# MAX_DOWNLOAD_TASKS is the number of worker processes (parallel downloads).
# Leaving the with-block terminates the pool; by then every result was consumed.
with multiprocessing.Pool(processes=MAX_DOWNLOAD_TASKS) as pool:
    failures = _download_batch(pool, tasks)

    if failures:
        print('\n\n##################################')
        print('# Retry all {} failures          #'.format(len(failures)))
        print('##################################')
        # Retry once; ``failures`` ends up holding only what failed again.
        failures = _download_batch(pool, failures)

if failures:
    print('\n{} file(s) still failed, re-run this section to retry them'.format(len(failures)))

print('\nAll done, Voila!')