In [2]:
import os
import hashlib
import shutil

from glob import glob

from io import BytesIO

from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

import requests
import imagehash
from PIL import Image

from tqdm.auto import tqdm


In [3]:
def map_progress(pool, seq, f):
    results = []

    with tqdm(total=len(seq)) as progress:
        futures = []

        for el in seq:
            future = pool.submit(f, el)
            future.add_done_callback(lambda p: progress.update())
            futures.append(future)

        for future in futures:
            result = future.result()
            results.append(result)

    return results

In [4]:
def calculate_phashes(image_stream):
    img = Image.open(image_stream)

    dhash = str(imagehash.dhash(img))
    phash = str(imagehash.phash(img))
    whash = str(imagehash.whash(img))

    return {
        "dhash": dhash,
        "phash": phash,
        "whash": whash,
    }


def calculate_hashes(image_bytes):
    stream = BytesIO(image_bytes)
    hashes = calculate_phashes(stream)

    md5 = hashlib.md5(image_bytes).hexdigest()
    hashes["md5"] = md5

    return hashes

In [5]:
!mkdir images

A subdirectory or file images already exists.


In [6]:
images = sorted(glob('images/*'))

In [7]:
def read_and_calculate_hashes(filename):
    with open(filename, 'rb') as f_in:
        content = f_in.read()
    h = calculate_hashes(content)
    return (filename, h)

In [8]:
with ProcessPoolExecutor() as pool:
    hashes = map_progress(pool, images, read_and_calculate_hashes)

0it [00:00, ?it/s]

In [9]:
md5s = {h['md5']: k for (k, h) in hashes}
phashes = {h['phash']: k for (k, h) in hashes}

In [10]:
blocked = set()

In [11]:
blocked

set()

In [12]:
token = 'AQAAAAASV-i8AACtpQpyXW6pOEN9tzTb_q-YA4k' #os.environ['TOLOKA_TOKEN']
host = 'https://toloka.dev'

headers = {
    'Authorization': 'OAuth %s' % token
}

In [13]:
def download_image_and_calc_hashes(attachment):
    try:
        filename = 'images/%s' % attachment

        if os.path.exists(filename):
            print('%s already exist' % filename)
            _, h = read_and_calculate_hashes(filename)
            return attachment, h

        print('downloading %s...' % filename)

        image_url_template = '%s/api/v1/attachments/%s/download'
        image_url = image_url_template % (host, attachment)
        res = requests.get(image_url, headers=headers)

        with open(filename, 'wb') as f_out:
            f_out.write(res.content)

        h = calculate_hashes(res.content)
        return (attachment, h)
    except OSError:
        return (attachment, None)

In [17]:
def reject(id, verdict):
    if verdict == 'accept':
        status = 'ACCEPTED'
        comment = 'thank you'
    else:
        status = 'REJECTED'
        comment = rejection_reasons[verdict]

    verdict_url_template = '%s/api/v1/assignments/%s'
    verdict_url = verdict_url_template % (host, id)

    request_body = {
        'status': status,
        'public_comment': comment
    }
    
    resp = requests.patch(verdict_url, headers=headers, json=request_body)
    print(id, status, comment)

def reject_tuple(expand):
    id, verdict = expand
    reject(id, verdict)
    
def reject_dupplicate(id):
    reject(id, 'duplicate')

## Start here

In [18]:
resp = requests.get(URL, headers=headers).json()
print(resp['has_more'])
print(len(resp['items']))

items = {d['id']: d for d in resp['items']}
users = {d['id']: d['user_id'] for d in resp['items']}

False
9


In [19]:
attachments = {v['solutions'][0]['output_values']['image']: k for k, v in items.items()}
with ThreadPoolExecutor() as pool:
    results = map_progress(pool, list(attachments), download_image_and_calc_hashes)

HBox(children=(IntProgress(value=0, max=9), HTML(value='')))

downloading images/b6793b19-d070-4879-938a-e754549e5501...
downloading images/d0b0fc4d-cc2c-4abe-97cb-353423dc07d1...downloading images/167f23be-4615-498a-ace4-d0b0f64cc95f...

downloading images/78334dc5-2c60-48f8-a665-5d33e357c2d5...
downloading images/16740066-3329-4165-884a-25c42814d590...
downloading images/df545cf1-5461-4745-8e32-c7e6e1247d2e...
downloading images/d66fbc59-e189-47ff-b487-05d7716d75c0...
downloading images/a18f1511-770a-4900-b748-691c6021a3b4...
downloading images/03103065-f445-44a5-b707-53b73534f57d...



In [20]:
duplicates = []

for a, h in results:
    if h is None:
        continue
    md5 = h['md5']
    if md5 in md5s:
        item_id = attachments[a]
        print('duplicate!', item_id, a)
        print('original:', md5s[md5])
        duplicates.append((item_id, a))
        continue 

    md5s[md5] = a

duplicate_items = {i for (i, _) in duplicates}
duplicate_attachments = {i for (_, i) in duplicates}
duplicates_results = [(d, 'duplicate') for d in duplicate_items]

In [21]:
for a, h in results:
    if h is None:
        continue
    phash = h['phash']
    if a in duplicate_attachments:
        print('already md5 duplicate, skipping')
        continue

    if phash in phashes:
        item_id = attachments[a]
        print('phash duplicate!', item_id, a)
        print('candidate:', phashes[phash])
        print()
    else:
        phashes[phash] = a

In [22]:
items_pool = []
for item_id, item in items.items():
    if item_id not in duplicate_items:
        items_pool.append(item_id)
print(len(items_pool), len(items))

9 9


In [23]:
results = annotate(items_pool,
    options=options,
    include_skip=False,
    display_fn=show_item)

HTML(value='0 examples annotated, 10 examples left')

HBox(children=(Button(description='accept', style=ButtonStyle()), Button(description='stock', style=ButtonStyl…

Output()

Annotation done.


In [24]:
all_results = duplicates_results + results
Counter([v for (_, v) in all_results])

Counter({'accept': 8, 'bad_quality': 1})

In [25]:
with ThreadPoolExecutor() as pool:
    map_progress(pool, all_results, reject_tuple)

HBox(children=(IntProgress(value=0, max=9), HTML(value='')))

0000edc476--5f2fe2ac64fc8d11aa14d064 ACCEPTED отлично, спасибо
0000edc476--5f2fda9f17028d7448a50a4f ACCEPTED отлично, спасибо
0000edc476--5f2fe1712a01ac6dd718ca37 ACCEPTED отлично, спасибо
0000edc476--5f2fe08007d73947a61890fe ACCEPTED отлично, спасибо
0000edc476--5f2fd9f094ee1b7a5d5cadb0 ACCEPTED отлично, спасибо
0000edc476--5f2fd9067b8dbe5e5bff8fac ACCEPTED отлично, спасибо
0000edc476--5f2fdc9122d6df43d6d3bfb6 ACCEPTED отлично, спасибо
0000edc476--5f2fdab64c55035903c5ccca REJECTED плохое качество, плохо видно одежду
0000edc476--5f2fe30fb6f8735169f96416 ACCEPTED отлично, спасибо



In [26]:
for id, verdict in tqdm(all_results):
    if verdict in {'stock', 'not clothes', 'duplicate'}:
        user_id = users[id]
        block_user(user_id, rejection_reasons[verdict])

HBox(children=(IntProgress(value=0, max=9), HTML(value='')))




    for id, verdict in results:
        if verdict != 'accept':
            continue

        item = items[id]
        output = item['solutions'][0]['output_values']
        attachment = output['image']

        src_file = 'images/%s' % attachment
        dest_file = 'accepted/%s' % attachment

        shutil.copyfile(src_file, dest_file)
        print('copied %s to %s, verdict: %s' % (src_file, dest_file, verdict))