## Recursive GoIndex Downloader by atlonxp

This code was created and improved by adapting the code from pankaj260 https://colab.research.google.com/drive/1tmsLGuswIZIZ_oM35EMW8TbJ6pQPt1rY#scrollTo=3bCnUMUg_SoT&forceEdit=true&sandboxMode=true

**Features**
*   Recursive crawler (atlonxp)
*   Download all folders and files in a given url (atlonxp)
*   Download all folders and files in sub-folders (atlonxp)
*   Adaptive delay in fetching url (atlonxp)
*   Store folders/files directly to your Google Drive (pankaj260)
*   Folders and files exclusion filters
*   Download queue supported
*   Auto-domain URL detection
*   API-based GoIndex crawler
*   Parallel/Multiple files downloader

**Version 2**:

	17 April 2020 (v2.1)

	+ fixed URL duplicated when crawling
	+ added a recursive search for the 'files' key, because some websites do not return a proper file structure

	16 April 2020

	+ crawler_v2:
		* API-based GoIndex crawler
		* Collecting all urls to be downloaded
	+ parallel downloader
		* tqdm progress bar

**Version 1**:

	15 April 2020
	-   Added auto-domain URL detection
	-   Added simple download queue

	14 April 2020
		-   initial

In [0]:
# Mount Google Drive at /content/drive so that files can be stored there.
# Skip this cell if you do not want to save to your Google Drive.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Install dependencies
!pip install requests tqdm

In [0]:
# Import dependencies

import json
from json import JSONDecodeError

import multiprocessing
import os
from pathlib import Path
from random import randint
from time import sleep
from urllib import parse

import requests
import tqdm

In [0]:
# Not referenced by any code visible in this notebook.
SHOW_DOWNLOAD_PROGRESS = False
# Default value of download_agent's OVERWRITE parameter.
OVERWRITE = True

# Inclusive bounds, in seconds, of the random pause taken before each
# crawl request and before each download (see the randint/sleep calls).
MIN_DELAY = 3
MAX_DELAY = 5


def check_exclusion(name, exclusions):
    """Tell whether ``name`` should be skipped.

    Returns True as soon as one entry of ``exclusions`` occurs as a
    substring of ``name``; returns False when none does (including when
    ``exclusions`` is empty).
    """
    return any(pattern in name for pattern in exclusions)


def find(key, dictionary):
    """Yield every value stored under ``key`` inside a nested structure.

    The structure is walked depth-first. Dictionaries are searched key by
    key and lists are searched element by element. A value found under
    ``key`` is yielded as-is and is not searched any further.

    Scalars (strings, numbers, None, ...) found inside lists are ignored
    instead of raising ``AttributeError``, and lists nested directly inside
    lists are searched too.

    Args:
        key: Dictionary key to look for.
        dictionary: Nested combination of dicts and lists (typically decoded
            JSON). Any other type yields nothing.

    Yields:
        Each value whose dictionary key equals ``key``.
    """
    if isinstance(dictionary, dict):
        for k, v in dictionary.items():
            if k == key:
                yield v
            elif isinstance(v, (dict, list)):
                yield from find(key, v)
    elif isinstance(dictionary, list):
        for item in dictionary:
            yield from find(key, item)


def crawler_v2(url, downloading_dict, path, level, exclusions, verbose=False):
    """Recursively collect download tasks from a folder-listing URL.

    POSTs to ``url``, decodes the JSON reply, takes the first ``'files'``
    list found in it and walks its entries: folder entries are crawled
    recursively, every other entry is appended to ``downloading_dict``.

    Args:
        url: URL of the folder listing to crawl.
        downloading_dict: List of task dicts collected so far; it is
            extended in place.
        path: Local destination folder that corresponds to ``url``.
        level: Recursion depth of this call (0 for the starting URL). It is
            only passed along, never inspected.
        exclusions: Substrings; an entry whose name contains any of them is
            skipped.
        verbose: When True, print the name of every collected file.

    Returns:
        ``downloading_dict``. Each added task has the keys ``'folder'``,
        ``'filename'``, ``'filename_abs'``, ``'size'`` and ``'url'``. When
        the listing cannot be fetched or decoded, or contains no ``'files'``
        entry, the tasks collected so far are returned unchanged.
    """
    # Local import: only the curl fallback needs it.
    import subprocess

    # Slow down a bit between listing requests.
    sleep(randint(MIN_DELAY, MAX_DELAY))

    url = parse.urlparse(url)
    listing_url = url.geturl()
    print(listing_url)

    try:
        response = requests.post(listing_url, data={})
        response_json = json.loads(response.text)
    except JSONDecodeError:
        print('- Data is missing! change a plan -')
        print('- > use terminal CURL            -')
        try:
            # Argument list instead of a shell string, so characters in the
            # URL can never be interpreted by a shell.
            completed = subprocess.run(
                ['curl', listing_url, '-d', ''],
                stdout=subprocess.PIPE,
                universal_newlines=True,
            )
            response_json = json.loads(completed.stdout)
        except (OSError, JSONDecodeError):
            print('Nah, something went wrong!')
            return downloading_dict
    except requests.RequestException:
        print('Nah, something went wrong!')
        return downloading_dict

    # Take the first 'files' value, wherever it sits in the reply.
    files = None
    if isinstance(response_json, dict):
        files = next(find('files', response_json), None)
    if not files:
        return downloading_dict

    # Child URLs are built by appending to the folder URL, which therefore
    # has to end with a slash.
    base_url = listing_url if listing_url.endswith('/') else listing_url + '/'

    for file in files:
        name = file['name']

        # Ignore entries whose name contains an exclusion word.
        if check_exclusion(name, exclusions):
            continue

        if 'folder' in file.get('mimeType', ''):
            next_url = base_url + parse.quote(name) + '/'
            next_path = os.path.join(path, name)
            downloading_dict = crawler_v2(next_url, downloading_dict, next_path, level + 1, exclusions, verbose)
        else:
            if verbose:
                print('  ' + name)
            downloading_dict.append({
                'folder': path,
                'filename': name,
                'filename_abs': os.path.join(path, name),
                # Tolerate entries that carry no size.
                'size': file.get('size'),
                'url': base_url + parse.quote(name),
            })

    return downloading_dict


def download_agent(task, OVERWRITE=OVERWRITE):
    """Download the file described by one task to disk.

    Args:
        task: Dict with the keys ``'folder'`` (directory to create),
            ``'filename_abs'`` (full path of the file to write) and
            ``'url'`` (address to download from).
        OVERWRITE: When True, an existing file is replaced. When False, a
            file that already exists is left alone and the task counts as
            done.

    Returns:
        0 when the file was written or skipped, 1 when the request failed or
        the server answered with a status other than 200.
    """
    # Firing requests too quickly can get the client banned, so wait a
    # random MIN_DELAY..MAX_DELAY seconds first.
    sleep(randint(MIN_DELAY, MAX_DELAY))

    folder = task['folder']
    filename_abs = task['filename_abs']
    url = task['url']

    Path(folder).mkdir(parents=True, exist_ok=True)

    # Nothing to do when the file is already there and must be kept.
    if os.path.exists(filename_abs) and not OVERWRITE:
        return 0

    try:
        with requests.get(url, stream=True) as r:
            if r.status_code != 200:
                return 1
            # 'wb' replaces any previous content; the body is written chunk
            # by chunk so large files are never held in memory as a whole.
            with open(filename_abs, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
    except requests.RequestException:
        # Report a network error as a failed download instead of letting it
        # escape from the worker.
        return 1
    return 0

In [0]:
# Number of worker processes handed to the download pool.
MAX_DOWNLOAD_TASKS = 4
# Entries whose name contains one of these substrings are not crawled.
exclusions = ['__MACOSX/']

# Root folder under which every queued download gets its own sub-folder.
destination = "/content/drive/My Drive/Knowledge/_Trainings/_download"
# Download queue: one entry per remote folder to crawl.
download_tasks = [
    dict(
        folder='FrontEndMasters - Complete Intro to Containers',
        url='https://tutnetflix.mlwdl.workers.dev/FrontEndMasters%20-%20Complete%20Intro%20to%20Containers/',
    ),
    dict(
        folder='test',
        url='https://gdrv.icu/0:/Star/宮崎あや/2013-2016/',
    ),
]

print(
    '##################################',
    '# Crawling all downloadable urls #',
    '##################################',
    sep='\n',
    end='\n\n',
)

# Crawl every queued folder and gather the resulting file tasks in one list.
tasks = []
for task in download_tasks:
    tasks.extend(
        crawler_v2(
            task['url'],
            [],
            os.path.join(destination, task['folder']),
            0,
            exclusions,
            verbose=False,
        )
    )

print('\nCollecting', len(tasks), 'is completed', end='\n\n')



In [0]:
print('##################################')
print('# Downloading files and folders  #')
print('##################################', end='\n\n')

failures = 0
tasks_list = [task.get('filename') for task in tasks]

# MAX_DOWNLOAD_TASKS worker processes download in parallel. The context
# manager terminates the pool even when the loop below raises.
with multiprocessing.Pool(processes=MAX_DOWNLOAD_TASKS) as pool:
    with tqdm.tqdm(total=len(tasks)) as pbar:
        # imap yields results in submission order, so the i-th result
        # belongs to tasks_list[i] and the label names the right file.
        for i, result in enumerate(pool.imap(download_agent, tasks)):
            pbar.set_description('Downloading %s' % tasks_list[i])
            failures += result
            pbar.update()
print('\nTotal number of download failures:', failures)

print('\nAll done, Voila!')