## Recursive GoIndex Downloader by atlonxp

This code was created and improved by adapting the code from pankaj260 https://colab.research.google.com/drive/1tmsLGuswIZIZ_oM35EMW8TbJ6pQPt1rY#scrollTo=3bCnUMUg_SoT&forceEdit=true&sandboxMode=true

**Features**
*   Recursive crawler (atlonxp)
*   Download all folders and files in a given url (atlonxp)
*   Download all folders and files in sub-folders (atlonxp)
*   Adaptive delay in fetching url (atlonxp)
*   Store folders/files directly to your Google Drive (pankaj260)
*   Folders and files exclusion filters
*   Download queue supported
*   Auto-domain URL detection

**Coming soon**
-   tqdm multiple/parallel downloader
-   Aria2 integration


**Changelogs**

15 April 2020:
-   Added auto-domain URL detection
-   Added simple download queue

14 April 2020:
-   initial



In [0]:
# Mount Google Drive at /content/drive so downloads can be written to it.
# Skip this cell if you do not want to save to your Google Drive.

from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Install dependencies: the Chromium driver used by Selenium, plus the
# Python packages imported by the cells below.
# (Lines starting with "!" are shell commands run by the notebook.)

!apt-get update
!apt install chromium-chromedriver
!pip install selenium bs4 requests

In [0]:
# Import dependencies

import os
import re

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait

In [0]:
# Functions

def get_page_source(url, browser, delay, level, verbose=False):
    """Load ``url`` in ``browser`` and return the rendered HTML.

    Waits until the first ``<li>`` under the element with id ``list`` can be
    located, or until ``delay`` seconds have elapsed.

    Args:
        url: Address of the index page to open.
        browser: Selenium WebDriver instance used to fetch the page.
        delay: Maximum number of seconds to wait for the listing to appear.
        level: Nesting depth; only used to indent the verbose output.
        verbose: When true, print ``url`` before loading it.

    Returns:
        The page source reported by ``browser.page_source``.

    Raises:
        Exception: If the wait times out. The original ``TimeoutException``
            is attached as the cause.
    """
    # Local import: keeps this function self-contained. ``find_element`` with
    # a ``By`` locator replaces ``find_element_by_xpath``, which recent
    # Selenium releases no longer provide.
    from selenium.webdriver.common.by import By

    try:
        if verbose:
            print('\t' * level, url)
        browser.get(url)
        # NOTE(review): a page whose list has no <li> entries (presumably an
        # empty folder) can never satisfy this wait and is reported as a load
        # failure - confirm whether that is intended.
        WebDriverWait(browser, delay).until(
            lambda drv: drv.find_element(By.XPATH, '//*[@id="list"]/li[1]'))
        return browser.page_source
    except TimeoutException as err:
        raise Exception('Unable to load this website') from err


def check_exclusion(url, exclusions):
    """Tell whether ``url`` should be skipped.

    Args:
        url: The URL to test.
        exclusions: Iterable of substrings; a match on any one excludes the URL.

    Returns:
        True if at least one exclusion occurs inside ``url``, otherwise False.
    """
    return any(pattern in url for pattern in exclusions)


def crawler(url, downloading_dict, level, exclusions, max_delay=60, verbose=False):
    """Recursively collect the folders and files listed at ``url``.

    The page is rendered through ``get_page_source`` (using the module-level
    ``browser``), and every ``li.mdui-list-item a`` link is recorded in
    ``downloading_dict``. Folder links are crawled recursively.

    Args:
        url: Index page to crawl; must start with ``http://`` or ``https://``.
        downloading_dict: Dict that receives the entries found on this page.
        level: Current nesting depth (0 for the starting page).
        exclusions: Substrings; any folder/file URL containing one is skipped.
        max_delay: Seconds to wait for each page listing to appear.
        verbose: When true, print every URL as it is fetched.

    Returns:
        ``downloading_dict``, mapping each name to
        ``{'type': 'file', 'url': ...}`` or
        ``{'type': 'folder', 'url': ..., 'child': {...}}``.

    Raises:
        Exception: If ``url`` has no recognisable scheme/host prefix, or if a
            page fails to load (raised by ``get_page_source``).
    """
    domain_match = re.search(r'(https?://[A-Za-z_0-9.-]+).*', url)
    if not domain_match:
        raise Exception('Wrong format URL')

    # Link hrefs are appended to the scheme + host part of the URL.
    domain_url = domain_match.group(1)

    page_source = get_page_source(url, browser, max_delay, level, verbose)
    soup = BeautifulSoup(page_source, 'html.parser')

    for link in soup.select('li.mdui-list-item a'):
        # The second child node carries the icon name followed by the label.
        label = link.contents[1].text

        if 'folder_open' in label:
            folder_name = label.replace('folder_open', '').strip().replace('/', '')
            folder_url = domain_url + link['href']

            if not check_exclusion(folder_url, exclusions):
                downloading_dict[folder_name] = {
                    'type': 'folder',
                    'url': folder_url,
                    # Pass the caller's settings down so sub-folders use the
                    # same wait limit and verbosity as the starting page.
                    'child': crawler(folder_url, {}, level + 1, exclusions,
                                     max_delay=max_delay, verbose=verbose)
                }
        else:
            file_name = label.replace('insert_drive_file', '').strip().replace('/', '')
            # Drop the "?a=view" suffix from the link to get the file URL.
            file_url = domain_url + link['href'].replace('?a=view', '')

            if not check_exclusion(file_url, exclusions):
                downloading_dict[file_name] = {
                    'type': 'file',
                    'url': file_url
                }

    return downloading_dict


def _download_file(url, file_path, chunk_size=1024 * 1024):
    """Stream ``url`` into ``file_path``; return True on success.

    The data is written to ``file_path + '.part'`` and renamed only after the
    transfer completes, so an interrupted download is never mistaken for a
    finished file on the next run.
    """
    partial_path = file_path + '.part'
    try:
        with requests.get(url, stream=True) as response:
            # Do not save an HTTP error page as if it were the file.
            response.raise_for_status()
            with open(partial_path, 'wb') as out:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        out.write(chunk)
        os.replace(partial_path, file_path)
        return True
    except requests.RequestException as err:
        print("failed => {} ({})".format(file_path, err))
        return False
    finally:
        # Remove leftovers from a failed or interrupted transfer.
        if os.path.exists(partial_path):
            os.remove(partial_path)


def downloader(downloading_dict, level, path, verbose=False):
    """Materialise a crawled tree under ``path``.

    Args:
        downloading_dict: Mapping of name to entry. Each entry has a ``'type'``
            of ``'folder'`` (with a ``'child'`` mapping) or anything else for
            a file (with a ``'url'``).
        level: Nesting depth; only used to indent the verbose output.
        path: Destination directory. It and any missing parents are created.
        verbose: When true, print every destination path as it is processed.

    Files that already exist at their destination are skipped. A file whose
    HTTP request fails is reported and the remaining entries are still
    processed.

    Raises:
        OSError: If a directory cannot be created or a file cannot be written.
    """
    # makedirs also creates missing parent directories.
    os.makedirs(path, exist_ok=True)

    for key, value in downloading_dict.items():
        target = os.path.join(path, key)
        if verbose:
            print('\t' * level, target)

        if value['type'] == 'folder':
            # The recursive call creates ``target`` itself.
            downloader(value['child'], level + 1, target, verbose)
        elif os.path.exists(target):
            print("skipping => " + target)
        else:
            _download_file(value['url'], target)

In [0]:
# Browser Initialization

# Path of the chromedriver binary installed by the dependency cell.
chromedriver = "/usr/bin/chromedriver"

# Headless Chrome flags suitable for a container without a display.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# Current Selenium takes the driver path through a Service object, while
# older releases take it as the first positional argument. Try the current
# form first and fall back to the legacy call when ``service`` is rejected.
try:
    from selenium.webdriver.chrome.service import Service
    browser = webdriver.Chrome(service=Service(chromedriver), options=chrome_options)
except TypeError:
    browser = webdriver.Chrome(chromedriver, options=chrome_options)

In [0]:
# Any folder/file URL containing one of these substrings is skipped.
exclusions = ['__MACOSX/']
# Maximum number of seconds to wait for each page listing to load.
max_waiting_delay = 60

# Root directory that receives one sub-folder per task.
destination = "/content/drive/My Drive/Knowledge/_Trainings/_download/"
# Download queue: each task is crawled from 'url' and saved under 'folder'.
download_tasks = [
    {
        'folder': 'ABC',
        'url': 'https://xxx.workers.dev/abc/'
    },
    {
        'folder': 'DEF',
        'url': 'https://xxx.workers.dev/def/'
    },
]

for task in download_tasks:
    print('Task: ', task['folder'])
    # Pass the configured delay so changing max_waiting_delay takes effect.
    downloading_dict = crawler(task['url'], {}, 0, exclusions,
                               max_delay=max_waiting_delay, verbose=True)
    downloader(downloading_dict, 0, path=os.path.join(destination, task['folder']), verbose=True)
    print('Task completed --------------------', end='\n\n')

print('All done, Voila!')