#### Site to mine: https://www.mindat.org/photos/555/
#### Sample jpg: https://www.mindat.org/photos/555/01/0555010001421587987.jpg
#### Each HTTP GET request has to include a Referer header pointing to the parent directory

In [2]:
import os
import tqdm
import requests
from requests_html import HTMLSession

Helper Functions

In [3]:
def printf(s):
    """Print `s` only while the module-level DEBUG flag is truthy."""
    if not DEBUG:
        return
    print(s)

def loc_path(url):
    """Map a remote URL to the local path it is stored under.

    With IS_FOLDERS set, the first len(ROOT_URL) characters are dropped so
    the remote directory layout is kept; otherwise only the text after the
    final '/' is returned.
    """
    if IS_FOLDERS:
        return url[len(ROOT_URL):]
    return url.rsplit('/', 1)[-1]

def mkdir_loc(url):
    """Create the local folder that mirrors `url`, if folders are enabled.

    Does nothing when IS_FOLDERS is false, when the mapped local path is
    empty (e.g. `url` is ROOT_URL itself), or when the path already exists.
    Missing intermediate folders are created as well, so the call does not
    depend on parent folders having been created first.
    """
    if not IS_FOLDERS:
        return
    path = loc_path(url)
    # An empty path refers to the current directory; os.makedirs('') raises.
    if not path or os.path.exists(path):
        return
    # exist_ok covers the folder appearing between the check and this call.
    os.makedirs(path, exist_ok=True)
    printf(f"Created folder at {path}")

def try_http_get(url, **kwargs):
    """GET `url` through the shared session `sess`; return None on failure.

    Every keyword argument (headers, timeout, params, ...) is forwarded to
    `sess.get` unchanged, so callers can pass e.g. `timeout=` to avoid
    waiting indefinitely on a stalled connection.

    Returns the response object, or None when the request raised a
    `requests.exceptions.RequestException` (the error and URL are printed).
    """
    try:
        return sess.get(url, **kwargs)
    except requests.exceptions.RequestException as e:
        # Best-effort crawl: report the failure and let the caller skip the URL.
        print(e, url)
        return None

In [4]:
### INPUTS ###
# True: keep the site's folder layout locally (loc_path strips ROOT_URL and
# mkdir_loc creates the sub-folders). False: loc_path returns only the text
# after the last '/' and mkdir_loc does nothing.
IS_FOLDERS = True
# True: printf() prints its argument; False: printf() stays silent.
DEBUG = True
# Site prefix; loc_path drops len(ROOT_URL) leading characters from each URL
# when IS_FOLDERS is set.
ROOT_URL = 'https://www.mindat.org/'
# Appended to ROOT_URL to form the top-level URL the crawl starts from.
PARENT_DIR = 'photos/'
### END INPUTS ###

In [5]:
# create HTML session
# One requests_html session shared by all requests; try_http_get reads this
# module-level name.
sess = HTMLSession()

In [6]:
# first parent
# The root listing is fetched only to report its status; the crawl below
# builds its URLs from a fixed range and does not use this response.
p1 = ROOT_URL+PARENT_DIR
mkdir_loc(p1)
r = try_http_get(p1)
# try_http_get returns None on a request error, so guard before .status_code
if r is not None:
    printf(f"parent url: {p1}, status code: {r.status_code}")

# first child: <p1>000/ ... <p1>999/
c1s = [p1+'{:03}/'.format(i) for i in range(1000)]
for c1 in tqdm.tqdm(c1s):
    mkdir_loc(c1)
    # NOTE(review): for a URL ending in '/', this referer is the URL itself
    # without the trailing slash, not its parent directory - confirm intended.
    r = try_http_get(c1, headers={'referer': '/'.join(c1.split('/')[:-1])})
    if r is None: continue
    printf(f"first child url: {c1}, status code: {r.status_code}")

    # second child: links of length 3 found on the first-child page
    # (computed before the inner loop rebinds `r`)
    c2s = sorted(c1+u for u in r.html.links if len(u) == 3)
    for c2 in tqdm.tqdm(c2s):
        mkdir_loc(c2)
        r = try_http_get(c2, headers={'referer': '/'.join(c2.split('/')[:-1])})
        if r is None: continue
        printf(f"second child url: {c2}, status code: {r.status_code}")

        # third child: image links found on the second-child page
        c3s = [c2+u for u in r.html.links if '.jpg' in u or '.png' in u]
        for c3 in c3s:
            # download the image; the referer here is the containing folder
            r = try_http_get(c3, headers={'referer': '/'.join(c3.split('/')[:-1])})
            if r is None: continue
            printf(f"third child url: {c3}, status code: {r.status_code}")
            # save image
            if r.status_code == 200 and r.content != b'':
                jpg_path = loc_path(c3)
                with open(jpg_path, 'wb') as f:
                    # Write the response body directly; no need to build an
                    # HTML parser object just to get the same bytes back.
                    f.write(r.content)



parent url: https://www.mindat.org/photos/, status code: 200


  0%|                                                                                         | 0/1000 [00:00<?, ?it/s]

first child url: https://www.mindat.org/photos/000/, status code: 200



  0%|                                                                                          | 0/100 [00:00<?, ?it/s]

second child url: https://www.mindat.org/photos/000/00/, status code: 200
third child url: https://www.mindat.org/photos/000/00/00000020014378957949659.jpg, status code: 200
('Connection aborted.', RemoteDisconnected('Remote end closed connection without response',)) https://www.mindat.org/photos/000/00/00000020014460934567596.jpg
third child url: https://www.mindat.org/photos/000/00/0000007001315582593.jpg, status code: 200
third child url: https://www.mindat.org/photos/000/00/0000005001177189069.jpg, status code: 200
third child url: https://www.mindat.org/photos/000/00/0000002001318303526.jpg, status code: 200
third child url: https://www.mindat.org/photos/000/00/00000040014610169264918.jpg, status code: 200
third child url: https://www.mindat.org/photos/000/00/0000009001273905760.jpg, status code: 200



  1%|▊                                                                                 | 1/100 [00:05<09:25,  5.71s/it]

second child url: https://www.mindat.org/photos/000/01/, status code: 200
third child url: https://www.mindat.org/photos/000/01/00000190014921001422173.jpg, status code: 200
third child url: https://www.mindat.org/photos/000/01/0000012001320406904.jpg, status code: 200
third child url: https://www.mindat.org/photos/000/01/0000016001275557046.jpg, status code: 200
third child url: https://www.mindat.org/photos/000/01/00000110014636885711132.jpg, status code: 200
third child url: https://www.mindat.org/photos/000/01/0000015001239114257.jpg, status code: 200
third child url: https://www.mindat.org/photos/000/01/0000013001218493133.jpg, status code: 200



  2%|█▋                                                                                | 2/100 [00:07<06:25,  3.93s/it]

second child url: https://www.mindat.org/photos/000/02/, status code: 200
third child url: https://www.mindat.org/photos/000/02/0000024001302441715.jpg, status code: 200
third child url: https://www.mindat.org/photos/000/02/0000022001416296126.jpg, status code: 200
third child url: https://www.mindat.org/photos/000/02/0000024001220979199.jpg, status code: 200
third child url: https://www.mindat.org/photos/000/02/0000026001175593386.jpg, status code: 200
third child url: https://www.mindat.org/photos/000/02/0000027001183648237.jpg, status code: 200
third child url: https://www.mindat.org/photos/000/02/0000021001328064867.jpg, status code: 200
third child url: https://www.mindat.org/photos/000/02/0000025001307052836.jpg, status code: 200
third child url: https://www.mindat.org/photos/000/02/0000022001252999378.jpg, status code: 200



  3%|██▍                                                                               | 3/100 [00:10<05:52,  3.64s/it]

second child url: https://www.mindat.org/photos/000/03/, status code: 200
third child url: https://www.mindat.org/photos/000/03/0000031001254277428.jpg, status code: 200
third child url: https://www.mindat.org/photos/000/03/0000036001219518888.jpg, status code: 200
third child url: https://www.mindat.org/photos/000/03/0000035001201636084.jpg, status code: 200
third child url: https://www.mindat.org/photos/000/03/0000036001420862967.jpg, status code: 200
third child url: https://www.mindat.org/photos/000/03/0000030001324990882.jpg, status code: 200
third child url: https://www.mindat.org/photos/000/03/0000039001347316317.jpg, status code: 200
third child url: https://www.mindat.org/photos/000/03/0000031001396369388.jpg, status code: 200
third child url: https://www.mindat.org/photos/000/03/0000036001431845590.jpg, status code: 200
third child url: https://www.mindat.org/photos/000/03/0000034001228415823.jpg, status code: 200



  4%|███▎                                                                              | 4/100 [00:13<05:32,  3.46s/it]

second child url: https://www.mindat.org/photos/000/04/, status code: 200
third child url: https://www.mindat.org/photos/000/04/0000044001019964891.jpg, status code: 200
third child url: https://www.mindat.org/photos/000/04/00000410015190648377627.jpg, status code: 200
third child url: https://www.mindat.org/photos/000/04/00000420014894491792096.jpg, status code: 200


KeyboardInterrupt: 