In [1]:
import hashlib
from pathlib import Path

import pandas as pd
import requests
from tqdm.auto import tqdm

In [4]:
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import ThreadPoolExecutor

# Shared thread pool (12 workers) that the download tasks are submitted to.
pool = ThreadPoolExecutor(max_workers=12)

In [5]:
def map_progress(pool, seq, f):
    """Apply ``f`` to every element of ``seq`` on ``pool`` behind a progress bar.

    Args:
        pool: Executor exposing ``submit(fn, arg)`` that returns a future.
        seq: Iterable of inputs. It is materialised into a list first, so
            generators and other iterables without ``len()`` are accepted.
        f: Callable taking a single element of ``seq``.

    Returns:
        list: The results of ``f``, in the same order as the elements of ``seq``.

    Raises:
        Exception: Whatever ``f`` raised for the first failing element (in
            input order), re-raised by ``future.result()``.
    """
    # Every element is submitted up front anyway, so materialising costs
    # nothing extra and gives the progress bar a reliable total.
    items = list(seq)

    with tqdm(total=len(items)) as progress:
        futures = []

        for el in items:
            future = pool.submit(f, el)
            # Advance the bar as each task finishes, in whatever order that is.
            future.add_done_callback(lambda _done: progress.update())
            futures.append(future)

        # Collect in submission order so results line up with the inputs.
        return [future.result() for future in futures]

In [38]:
def download_and_save(record, timeout=30):
    """Download one image into ``data/<label>/<filename>`` unless already there.

    Args:
        record: Mapping with an ``'image_url'`` entry (its last ``/``-separated
            segment becomes the filename) and a ``'label'`` entry (used as the
            sub-directory name under ``data``).
        timeout: Seconds passed to ``requests.get`` so a stalled server cannot
            block a worker indefinitely.

    Returns:
        pathlib.Path: Relative path of the image file, whether it was just
        downloaded or was already present.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx response. Nothing is written in that case.
    """
    image_url = record['image_url']
    label = record['label']

    filename = image_url.split('/')[-1]
    fullpath = Path('data') / label / filename

    # Already fetched on a previous run: skip the network round trip.
    if fullpath.exists():
        return fullpath

    image_resp = requests.get(image_url, timeout=timeout)
    # Without this check an error page body would be saved under the image's
    # name and then be treated as a valid cached download on later runs.
    image_resp.raise_for_status()

    fullpath.parent.mkdir(parents=True, exist_ok=True)

    with fullpath.open(mode='wb') as f_out:
        f_out.write(image_resp.content)

    return fullpath

In [33]:
# Load the table of images to fetch; its rows are later turned into the
# records handed to download_and_save (which reads 'image_url' and 'label').
df = pd.read_csv(filepath_or_buffer='data-all.csv')

False

In [34]:
# Row count of the loaded table.
df.shape[0]

8214

In [39]:
# One dict per row, keyed by column name, then download them all on the pool.
records = df.to_dict('records')
paths = map_progress(pool=pool, seq=records, f=download_and_save)

  0%|          | 0/8214 [00:00<?, ?it/s]

In [40]:
# Spot-check one result: the local path returned for the record at index 16.
paths[16]

WindowsPath('data/plate/eu.a3da0582-366d-4428-a595-fdd750be7562.jpg')

In [41]:
from glob import glob

In [46]:
# Every entry exactly one level below a sub-directory of data/, in sorted order.
files = list(Path('data').glob('*/*'))
files.sort()

('data', 'cup', 'eu.0016a837-6584-4276-82b3-211a053e84f8.jpg')

In [50]:
import random

In [67]:
# Build one (file, label, test) row per downloaded file.
records = []

for file in files:
    # Paths come from Path('data').glob('*/*'), i.e. data/<label>/<filename>,
    # so the parent directory name is the label.
    label = file.parent.name
    # Second dot-separated token of the filename, e.g. 'eu.<id>.jpg' -> '<id>'.
    name = file.name.split('.')[1]

    # The built-in hash() is salted per interpreter process for str, so it
    # would move files between train and test on every kernel restart. A
    # digest of the name is stable across runs and machines.
    digest = hashlib.sha256(name.encode('utf-8')).hexdigest()
    h = int(digest, 16) % 10

    # Buckets 7, 8 and 9 (roughly 30% of names) form the test split.
    records.append((name, label, h >= 7))

In [71]:
# Tabulate the (file, label, test) tuples collected above.
df_final = pd.DataFrame(
    data=records,
    columns=['file', 'label', 'test'],
)

In [72]:
# Persist the full table, including the 'test' flag, without the row index.
df_final.to_csv(path_or_buf='final-data.csv', index=False)

In [73]:
# Split on the boolean 'test' flag, drop the flag column, and write each
# subset. Unlike final-data.csv, these files keep the default index column.
df_train = df_final.loc[df_final['test'].eq(False)].drop(columns=['test'])
df_train.to_csv('train.csv')

df_test = df_final.loc[df_final['test'].eq(True)].drop(columns=['test'])
df_test.to_csv('test.csv')
