In [1]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.1.0.tar.gz (78 kB)
[K     |████████████████████████████████| 78 kB 913 kB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 1.8 MB/s 
Collecting huggingface-hub
  Downloading huggingface_hub-0.2.0-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 338 kB/s 
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l- \ | done
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.1.0-py3-none-any.whl size=121000 sha256=ed4bed5b133dae265281ea326437b9b0e0c6ae90a27a80782faf64ef3ad9e93b
  Stored in directory: /root/.cache/pip/wheels/90/f0/bb/ed1add84da70092ea526466eadc2bfb197c4bcb8d4fa5f7bad
Successfully built sentence-transformers
Installing collected packages: huggingface-hub, 

In [2]:
import pandas as pd
import numpy as np

from tqdm import tqdm

from collections import Counter

import gc
import cloudpickle

import os, sys
import random

import warnings

import xgboost as xgb


import base64
import io

import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer, util
from PIL import Image

from tqdm import tqdm

# Report the XGBoost version in use; the ANSI escapes colour the name and version blue.
print('Using \033[34mXGBoost', xgb.__version__, '\033[0m', sep=' ')

Using [34mXGBoost 1.5.0 [0m


In [3]:
# Load the 'clip-ViT-B-32' checkpoint through sentence-transformers (weights are
# downloaded on first use). `model` is read as a global by transform2image_embeddings.
model = SentenceTransformer('clip-ViT-B-32')

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.03k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/525k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/605M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/389 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/961k [00:00<?, ?B/s]

ftfy or spacy is not installed using BERT BasicTokenizer instead of ftfy.


In [4]:
def transform2image_embeddings(df):
    """Embed the base64-encoded images in column ``1`` of ``df`` with the global ``model``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs an ``image_id`` column and a column labelled ``1`` whose values are
        base64-encoded image bytes. Rows are processed in positional order, so
        the frame's index does not have to be a default RangeIndex.

    Returns
    -------
    embeddings_df : pandas.DataFrame
        ``image_id`` followed by one column per embedding dimension.
    size_df : pandas.DataFrame
        ``image_id``, ``width`` and ``height`` (pixels) of each decoded image.

    Notes
    -----
    Processing is best-effort: a row whose image cannot be decoded or encoded is
    reported on stdout and kept with NaN width, height and embedding.
    """
    # Width of the NaN placeholder row; assumed to match the length of the vectors
    # returned by model.encode -- TODO confirm if the model is swapped.
    embedding_dim = 512
    missing_embedding = np.full(embedding_dim, np.nan)

    image2text_embeddings = []
    widths = []
    heights = []

    for k, data in enumerate(tqdm(df[1].tolist())):
        try:
            image = base64.b64decode(data)
            img = Image.open(io.BytesIO(image)).convert('RGB')
            width, height = img.size
            embedding = model.encode(img, show_progress_bar=False)
        except Exception as error:
            # Catch Exception (not a bare except) so Ctrl-C still interrupts the run.
            print(k, repr(error))
            width, height, embedding = np.nan, np.nan, missing_embedding

        # Append exactly once per row, after the whole row succeeded or failed, so
        # the three lists can never get out of step with each other.
        widths.append(width)
        heights.append(height)
        image2text_embeddings.append(embedding)

    if image2text_embeddings:
        embedding_matrix = np.vstack(image2text_embeddings)
    else:
        # np.vstack rejects an empty list; return a well-formed empty result instead.
        embedding_matrix = np.empty((0, embedding_dim))

    # Use raw values so image_id is attached positionally rather than by index label.
    image_ids = df['image_id'].to_numpy()

    embeddings_df = pd.DataFrame(embedding_matrix)
    embeddings_df.insert(loc=0, column='image_id', value=image_ids)

    size_df = pd.DataFrame({'image_id': image_ids, 'width': widths, 'height': heights})

    return embeddings_df, size_df

In [5]:
import requests


def prepare_train_embeddings(images_path, indices):
    """Download pixel-data parts and embed every image listed in ``images_path``.

    For each part index the gzip-compressed, tab-separated pixel file is fetched,
    joined to the image list on ``image_url`` and passed to
    ``transform2image_embeddings``. The loop stops early once the number of
    matched rows equals the number of listed images.

    Parameters
    ----------
    images_path : str
        CSV readable by ``pd.read_csv`` with an ``image_url`` column. Each row's
        positional index becomes its ``image_id``.
    indices : iterable of int
        Part numbers to download; each is formatted into the file name as a
        zero-padded three-digit number.

    Returns
    -------
    container : pandas.DataFrame
        One row per ``image_id`` (sorted) with the embedding columns. Every
        missing value is filled with 1.0.
    sizes : pandas.DataFrame
        ``image_id``, ``width``, ``height``; missing values stay NaN.

    Raises
    ------
    requests.HTTPError
        If a part download returns an HTTP error status.
    ValueError
        If ``indices`` is empty.

    Side effects
    ------------
    Overwrites ``pixels.csv`` in the working directory with the most recently
    downloaded part and prints progress to stdout.
    """
    url_template = 'https://analytics.wikimedia.org/published/datasets/one-off/caption_competition/training/image_pixels/part-00{0:03d}-04b253b8-db8c-4d14-a23f-3433a86841b4-c000.csv.gz'

    images = pd.read_csv(images_path)
    images['image_id'] = images.index

    container = []
    sizes = []
    found = 0

    for index in indices:
        print('\033[34mHandling part {}...\033[0m'.format(index))
        r = requests.get(url_template.format(index), allow_redirects=True, timeout=60)
        # Fail here with a clear HTTP error instead of later with an obscure gzip
        # error when an error page would otherwise be written to disk.
        r.raise_for_status()

        with open('pixels.csv', 'wb') as file:
            file.write(r.content)

        part_pixels = pd.read_csv('pixels.csv', compression='gzip', sep='\t', header=None)
        part_pixels.columns = ['image_url', 1, 2]
        # Inner join: keep only the pixel rows that belong to a listed image.
        matched = pd.merge(images, part_pixels, on='image_url')
        part_embeddings, size_df = transform2image_embeddings(matched)

        # Release the raw download and pixel frames before the next part is fetched.
        del r, part_pixels, matched
        gc.collect()

        container.append(part_embeddings)
        sizes.append(size_df)

        found += part_embeddings.shape[0]
        print('found: {0:.2f}'.format(found / images.shape[0]))
        print()

        if found == images.shape[0]:
            break

    if not container:
        raise ValueError('indices is empty: no pixel parts were processed')

    container = pd.concat(container)
    sizes = pd.concat(sizes)

    image_ids = images[['image_id']]

    # Left-join back onto the full image list so images never found still get a row.
    container = pd.merge(image_ids, container, on='image_id', how='left').sort_values(by='image_id')
    # NOTE(review): images that were never found, and images that failed to decode,
    # end up with an all-1.0 embedding vector -- confirm this sentinel is intended.
    container = container.fillna(np.float32(1.))

    sizes = pd.merge(image_ids, sizes, on='image_id', how='left').sort_values(by='image_id')

    return container, sizes

In [6]:
%%time

# Build embeddings and image sizes from pixel parts 0-199. The recorded run below took
# about 7h14m of wall time and matched 94% of the listed images, so the early-exit
# condition (every image found) was never reached.
embeddings, sizes = prepare_train_embeddings('../input/final-image-traindataset-part0-4-count-5/final_images.csv', range(200))

[34mHandling part 0...[0m


100%|██████████| 317/317 [01:02<00:00,  5.10it/s]


found: 0.00

[34mHandling part 1...[0m


100%|██████████| 331/331 [01:03<00:00,  5.17it/s]


found: 0.01

[34mHandling part 2...[0m


100%|██████████| 341/341 [01:04<00:00,  5.30it/s]


found: 0.01

[34mHandling part 3...[0m


100%|██████████| 339/339 [01:06<00:00,  5.09it/s]


found: 0.02

[34mHandling part 4...[0m


100%|██████████| 336/336 [01:04<00:00,  5.24it/s]


found: 0.02

[34mHandling part 5...[0m


100%|██████████| 351/351 [01:06<00:00,  5.25it/s]


found: 0.03

[34mHandling part 6...[0m


100%|██████████| 331/331 [01:02<00:00,  5.31it/s]


found: 0.03

[34mHandling part 7...[0m


100%|██████████| 334/334 [01:03<00:00,  5.25it/s]


found: 0.04

[34mHandling part 8...[0m


100%|██████████| 318/318 [01:00<00:00,  5.25it/s]


found: 0.04

[34mHandling part 9...[0m


100%|██████████| 331/331 [01:04<00:00,  5.13it/s]


found: 0.05

[34mHandling part 10...[0m


100%|██████████| 326/326 [01:02<00:00,  5.23it/s]


found: 0.05

[34mHandling part 11...[0m


100%|██████████| 343/343 [01:05<00:00,  5.24it/s]


found: 0.06

[34mHandling part 12...[0m


100%|██████████| 341/341 [01:05<00:00,  5.20it/s]


found: 0.06

[34mHandling part 13...[0m


100%|██████████| 362/362 [01:09<00:00,  5.21it/s]


found: 0.07

[34mHandling part 14...[0m


100%|██████████| 340/340 [01:09<00:00,  4.90it/s]


found: 0.07

[34mHandling part 15...[0m


100%|██████████| 332/332 [01:03<00:00,  5.21it/s]


found: 0.08

[34mHandling part 16...[0m


100%|██████████| 323/323 [01:01<00:00,  5.23it/s]


found: 0.08

[34mHandling part 17...[0m


100%|██████████| 356/356 [01:08<00:00,  5.17it/s]


found: 0.09

[34mHandling part 18...[0m


100%|██████████| 312/312 [01:00<00:00,  5.17it/s]


found: 0.09

[34mHandling part 19...[0m


100%|██████████| 314/314 [01:04<00:00,  4.84it/s]


found: 0.10

[34mHandling part 20...[0m


100%|██████████| 353/353 [01:08<00:00,  5.17it/s]


found: 0.10

[34mHandling part 21...[0m


100%|██████████| 373/373 [01:12<00:00,  5.13it/s]


found: 0.11

[34mHandling part 22...[0m


100%|██████████| 305/305 [00:58<00:00,  5.18it/s]


found: 0.11

[34mHandling part 23...[0m


100%|██████████| 309/309 [01:00<00:00,  5.11it/s]


found: 0.12

[34mHandling part 24...[0m


100%|██████████| 332/332 [01:04<00:00,  5.15it/s]


found: 0.12

[34mHandling part 25...[0m


100%|██████████| 286/286 [00:55<00:00,  5.16it/s]


found: 0.12

[34mHandling part 26...[0m


100%|██████████| 352/352 [01:21<00:00,  4.35it/s]


found: 0.13

[34mHandling part 27...[0m


100%|██████████| 303/303 [00:59<00:00,  5.10it/s]


found: 0.13

[34mHandling part 28...[0m


100%|██████████| 338/338 [01:05<00:00,  5.14it/s]


found: 0.14

[34mHandling part 29...[0m


100%|██████████| 300/300 [00:58<00:00,  5.12it/s]


found: 0.14

[34mHandling part 30...[0m


100%|██████████| 333/333 [01:04<00:00,  5.18it/s]


found: 0.15

[34mHandling part 31...[0m


100%|██████████| 336/336 [01:05<00:00,  5.12it/s]


found: 0.15

[34mHandling part 32...[0m


100%|██████████| 306/306 [00:59<00:00,  5.15it/s]


found: 0.16

[34mHandling part 33...[0m


100%|██████████| 327/327 [01:07<00:00,  4.85it/s]


found: 0.16

[34mHandling part 34...[0m


100%|██████████| 331/331 [01:04<00:00,  5.16it/s]


found: 0.17

[34mHandling part 35...[0m


100%|██████████| 338/338 [01:05<00:00,  5.13it/s]


found: 0.17

[34mHandling part 36...[0m


100%|██████████| 347/347 [01:07<00:00,  5.12it/s]


found: 0.18

[34mHandling part 37...[0m


100%|██████████| 322/322 [01:03<00:00,  5.10it/s]


found: 0.18

[34mHandling part 38...[0m


100%|██████████| 342/342 [01:14<00:00,  4.56it/s]


found: 0.19

[34mHandling part 39...[0m


100%|██████████| 304/304 [00:59<00:00,  5.08it/s]


found: 0.19

[34mHandling part 40...[0m


100%|██████████| 342/342 [01:07<00:00,  5.08it/s]


found: 0.20

[34mHandling part 41...[0m


100%|██████████| 326/326 [01:03<00:00,  5.09it/s]


found: 0.20

[34mHandling part 42...[0m


100%|██████████| 296/296 [00:58<00:00,  5.04it/s]


found: 0.20

[34mHandling part 43...[0m


100%|██████████| 354/354 [01:10<00:00,  5.00it/s]


found: 0.21

[34mHandling part 44...[0m


100%|██████████| 304/304 [01:01<00:00,  4.98it/s]


found: 0.21

[34mHandling part 45...[0m


100%|██████████| 317/317 [01:03<00:00,  4.97it/s]


found: 0.22

[34mHandling part 46...[0m


100%|██████████| 320/320 [01:03<00:00,  5.05it/s]


found: 0.22

[34mHandling part 47...[0m


100%|██████████| 305/305 [01:01<00:00,  4.99it/s]


found: 0.23

[34mHandling part 48...[0m


100%|██████████| 361/361 [01:10<00:00,  5.13it/s]


found: 0.23

[34mHandling part 49...[0m


100%|██████████| 367/367 [01:12<00:00,  5.08it/s]


found: 0.24

[34mHandling part 50...[0m


100%|██████████| 342/342 [01:13<00:00,  4.68it/s]


found: 0.24

[34mHandling part 51...[0m


100%|██████████| 363/363 [01:11<00:00,  5.08it/s]


found: 0.25

[34mHandling part 52...[0m


100%|██████████| 328/328 [01:09<00:00,  4.69it/s]


found: 0.25

[34mHandling part 53...[0m


100%|██████████| 322/322 [01:02<00:00,  5.16it/s]


found: 0.26

[34mHandling part 54...[0m


100%|██████████| 320/320 [01:02<00:00,  5.13it/s]


found: 0.26

[34mHandling part 55...[0m


100%|██████████| 339/339 [01:05<00:00,  5.15it/s]


found: 0.27

[34mHandling part 56...[0m


100%|██████████| 311/311 [01:00<00:00,  5.12it/s]


found: 0.27

[34mHandling part 57...[0m


100%|██████████| 334/334 [01:09<00:00,  4.83it/s]


found: 0.28

[34mHandling part 58...[0m


100%|██████████| 353/353 [01:08<00:00,  5.16it/s]


found: 0.28

[34mHandling part 59...[0m


100%|██████████| 330/330 [01:04<00:00,  5.13it/s]


found: 0.29

[34mHandling part 60...[0m


100%|██████████| 312/312 [01:00<00:00,  5.17it/s]


found: 0.29

[34mHandling part 61...[0m


100%|██████████| 306/306 [00:59<00:00,  5.14it/s]


found: 0.29

[34mHandling part 62...[0m


100%|██████████| 303/303 [01:01<00:00,  4.90it/s]


found: 0.30

[34mHandling part 63...[0m


100%|██████████| 328/328 [01:07<00:00,  4.88it/s]


found: 0.30

[34mHandling part 64...[0m


100%|██████████| 317/317 [01:04<00:00,  4.89it/s]


found: 0.31

[34mHandling part 65...[0m


100%|██████████| 353/353 [01:09<00:00,  5.07it/s]


found: 0.31

[34mHandling part 66...[0m


100%|██████████| 337/337 [01:13<00:00,  4.58it/s]


found: 0.32

[34mHandling part 67...[0m


100%|██████████| 315/315 [01:01<00:00,  5.15it/s]


found: 0.32

[34mHandling part 68...[0m


100%|██████████| 314/314 [01:01<00:00,  5.07it/s]


found: 0.33

[34mHandling part 69...[0m


100%|██████████| 311/311 [01:06<00:00,  4.66it/s]


found: 0.33

[34mHandling part 70...[0m


100%|██████████| 352/352 [01:09<00:00,  5.08it/s]


found: 0.34

[34mHandling part 71...[0m


100%|██████████| 327/327 [01:09<00:00,  4.70it/s]


found: 0.34

[34mHandling part 72...[0m


100%|██████████| 320/320 [01:02<00:00,  5.12it/s]


found: 0.35

[34mHandling part 73...[0m


100%|██████████| 305/305 [01:00<00:00,  5.06it/s]


found: 0.35

[34mHandling part 74...[0m


100%|██████████| 351/351 [01:08<00:00,  5.11it/s]


found: 0.36

[34mHandling part 75...[0m


100%|██████████| 336/336 [01:06<00:00,  5.07it/s]


found: 0.36

[34mHandling part 76...[0m


100%|██████████| 315/315 [01:01<00:00,  5.13it/s]


found: 0.36

[34mHandling part 77...[0m


  "Palette images with Transparency expressed in bytes should be "
100%|██████████| 305/305 [01:00<00:00,  5.07it/s]


found: 0.37

[34mHandling part 78...[0m


100%|██████████| 317/317 [01:02<00:00,  5.04it/s]


found: 0.37

[34mHandling part 79...[0m


100%|██████████| 317/317 [01:02<00:00,  5.09it/s]


found: 0.38

[34mHandling part 80...[0m


100%|██████████| 337/337 [01:06<00:00,  5.04it/s]


found: 0.38

[34mHandling part 81...[0m


100%|██████████| 329/329 [01:05<00:00,  5.06it/s]


found: 0.39

[34mHandling part 82...[0m


100%|██████████| 353/353 [01:09<00:00,  5.07it/s]


found: 0.39

[34mHandling part 83...[0m


100%|██████████| 338/338 [01:06<00:00,  5.08it/s]


found: 0.40

[34mHandling part 84...[0m


100%|██████████| 316/316 [01:14<00:00,  4.22it/s]


found: 0.40

[34mHandling part 85...[0m


100%|██████████| 326/326 [01:09<00:00,  4.72it/s]


found: 0.41

[34mHandling part 86...[0m


100%|██████████| 343/343 [01:07<00:00,  5.08it/s]


found: 0.41

[34mHandling part 87...[0m


100%|██████████| 339/339 [01:06<00:00,  5.10it/s]


found: 0.42

[34mHandling part 88...[0m


100%|██████████| 314/314 [01:01<00:00,  5.08it/s]


found: 0.42

[34mHandling part 89...[0m


100%|██████████| 297/297 [00:58<00:00,  5.04it/s]


found: 0.43

[34mHandling part 90...[0m


100%|██████████| 336/336 [01:15<00:00,  4.46it/s]


found: 0.43

[34mHandling part 91...[0m


100%|██████████| 339/339 [01:07<00:00,  5.03it/s]


found: 0.44

[34mHandling part 92...[0m


100%|██████████| 345/345 [01:08<00:00,  5.06it/s]


found: 0.44

[34mHandling part 93...[0m


100%|██████████| 327/327 [01:04<00:00,  5.08it/s]


found: 0.45

[34mHandling part 94...[0m


100%|██████████| 317/317 [01:03<00:00,  5.01it/s]


found: 0.45

[34mHandling part 95...[0m


100%|██████████| 331/331 [01:05<00:00,  5.05it/s]


found: 0.45

[34mHandling part 96...[0m


100%|██████████| 318/318 [01:03<00:00,  5.01it/s]


found: 0.46

[34mHandling part 97...[0m


100%|██████████| 322/322 [01:03<00:00,  5.06it/s]


found: 0.46

[34mHandling part 98...[0m


100%|██████████| 336/336 [01:06<00:00,  5.05it/s]


found: 0.47

[34mHandling part 99...[0m


100%|██████████| 328/328 [01:28<00:00,  3.70it/s]


found: 0.47

[34mHandling part 100...[0m


100%|██████████| 335/335 [01:06<00:00,  5.05it/s]


found: 0.48

[34mHandling part 101...[0m


100%|██████████| 336/336 [01:07<00:00,  5.01it/s]


found: 0.48

[34mHandling part 102...[0m


100%|██████████| 340/340 [01:07<00:00,  5.02it/s]


found: 0.49

[34mHandling part 103...[0m


100%|██████████| 317/317 [01:03<00:00,  5.03it/s]


found: 0.49

[34mHandling part 104...[0m


100%|██████████| 366/366 [01:15<00:00,  4.87it/s]


found: 0.50

[34mHandling part 105...[0m


100%|██████████| 337/337 [01:07<00:00,  4.99it/s]


found: 0.50

[34mHandling part 106...[0m


100%|██████████| 299/299 [01:00<00:00,  4.98it/s]


found: 0.51

[34mHandling part 107...[0m


100%|██████████| 314/314 [01:03<00:00,  4.98it/s]


found: 0.51

[34mHandling part 108...[0m


100%|██████████| 336/336 [01:05<00:00,  5.12it/s]


found: 0.52

[34mHandling part 109...[0m


100%|██████████| 328/328 [01:02<00:00,  5.27it/s]


found: 0.52

[34mHandling part 110...[0m


100%|██████████| 352/352 [01:06<00:00,  5.28it/s]


found: 0.53

[34mHandling part 111...[0m


100%|██████████| 343/343 [01:04<00:00,  5.31it/s]


found: 0.53

[34mHandling part 112...[0m


100%|██████████| 384/384 [01:16<00:00,  5.00it/s]


found: 0.54

[34mHandling part 113...[0m


100%|██████████| 316/316 [01:10<00:00,  4.48it/s]


found: 0.54

[34mHandling part 114...[0m


100%|██████████| 350/350 [01:10<00:00,  4.95it/s]


found: 0.55

[34mHandling part 115...[0m


100%|██████████| 301/301 [01:00<00:00,  4.96it/s]


found: 0.55

[34mHandling part 116...[0m


100%|██████████| 320/320 [01:04<00:00,  4.99it/s]


found: 0.55

[34mHandling part 117...[0m


100%|██████████| 328/328 [01:05<00:00,  4.97it/s]


found: 0.56

[34mHandling part 118...[0m


100%|██████████| 342/342 [01:11<00:00,  4.76it/s]


found: 0.56

[34mHandling part 119...[0m


100%|██████████| 312/312 [01:03<00:00,  4.92it/s]


found: 0.57

[34mHandling part 120...[0m


100%|██████████| 315/315 [01:03<00:00,  4.93it/s]


found: 0.57

[34mHandling part 121...[0m


100%|██████████| 317/317 [01:04<00:00,  4.93it/s]


found: 0.58

[34mHandling part 122...[0m


100%|██████████| 312/312 [01:03<00:00,  4.93it/s]


found: 0.58

[34mHandling part 123...[0m


100%|██████████| 319/319 [01:04<00:00,  4.92it/s]


found: 0.59

[34mHandling part 124...[0m


100%|██████████| 328/328 [01:04<00:00,  5.09it/s]


found: 0.59

[34mHandling part 125...[0m


100%|██████████| 316/316 [01:00<00:00,  5.24it/s]


found: 0.60

[34mHandling part 126...[0m


100%|██████████| 318/318 [01:07<00:00,  4.71it/s]


found: 0.60

[34mHandling part 127...[0m


100%|██████████| 305/305 [01:09<00:00,  4.36it/s]


found: 0.61

[34mHandling part 128...[0m


100%|██████████| 330/330 [01:07<00:00,  4.87it/s]


found: 0.61

[34mHandling part 129...[0m


100%|██████████| 363/363 [01:15<00:00,  4.78it/s]


found: 0.62

[34mHandling part 130...[0m


100%|██████████| 325/325 [01:17<00:00,  4.20it/s]


found: 0.62

[34mHandling part 131...[0m


100%|██████████| 321/321 [01:07<00:00,  4.77it/s]


found: 0.62

[34mHandling part 132...[0m


100%|██████████| 299/299 [01:03<00:00,  4.74it/s]


found: 0.63

[34mHandling part 133...[0m


100%|██████████| 322/322 [01:07<00:00,  4.79it/s]


found: 0.63

[34mHandling part 134...[0m


100%|██████████| 350/350 [01:11<00:00,  4.89it/s]


found: 0.64

[34mHandling part 135...[0m


100%|██████████| 307/307 [01:02<00:00,  4.88it/s]


found: 0.64

[34mHandling part 136...[0m


100%|██████████| 325/325 [01:02<00:00,  5.21it/s]


found: 0.65

[34mHandling part 137...[0m


100%|██████████| 273/273 [00:51<00:00,  5.27it/s]


found: 0.65

[34mHandling part 138...[0m


100%|██████████| 334/334 [01:08<00:00,  4.85it/s]


found: 0.66

[34mHandling part 139...[0m


100%|██████████| 338/338 [01:08<00:00,  4.91it/s]


found: 0.66

[34mHandling part 140...[0m


100%|██████████| 337/337 [01:08<00:00,  4.90it/s]


found: 0.67

[34mHandling part 141...[0m


100%|██████████| 319/319 [01:12<00:00,  4.41it/s]


found: 0.67

[34mHandling part 142...[0m


100%|██████████| 324/324 [01:06<00:00,  4.87it/s]


found: 0.68

[34mHandling part 143...[0m


100%|██████████| 333/333 [01:08<00:00,  4.89it/s]


found: 0.68

[34mHandling part 144...[0m


100%|██████████| 320/320 [01:04<00:00,  4.93it/s]


found: 0.68

[34mHandling part 145...[0m


100%|██████████| 314/314 [01:22<00:00,  3.82it/s]


found: 0.69

[34mHandling part 146...[0m


100%|██████████| 284/284 [00:56<00:00,  4.99it/s]


found: 0.69

[34mHandling part 147...[0m


100%|██████████| 332/332 [01:02<00:00,  5.31it/s]


found: 0.70

[34mHandling part 148...[0m


100%|██████████| 334/334 [01:07<00:00,  4.98it/s]


found: 0.70

[34mHandling part 149...[0m


100%|██████████| 328/328 [01:07<00:00,  4.83it/s]


found: 0.71

[34mHandling part 150...[0m


100%|██████████| 317/317 [01:06<00:00,  4.75it/s]


found: 0.71

[34mHandling part 151...[0m


100%|██████████| 345/345 [01:11<00:00,  4.83it/s]


found: 0.72

[34mHandling part 152...[0m


100%|██████████| 327/327 [01:07<00:00,  4.82it/s]


found: 0.72

[34mHandling part 153...[0m


100%|██████████| 322/322 [01:07<00:00,  4.80it/s]


found: 0.73

[34mHandling part 154...[0m


100%|██████████| 344/344 [01:11<00:00,  4.82it/s]


found: 0.73

[34mHandling part 155...[0m


100%|██████████| 364/364 [01:22<00:00,  4.40it/s]


found: 0.74

[34mHandling part 156...[0m


100%|██████████| 346/346 [01:10<00:00,  4.88it/s]


found: 0.74

[34mHandling part 157...[0m


100%|██████████| 307/307 [01:04<00:00,  4.76it/s]


found: 0.75

[34mHandling part 158...[0m


100%|██████████| 349/349 [01:21<00:00,  4.27it/s]


found: 0.75

[34mHandling part 159...[0m


100%|██████████| 321/321 [01:01<00:00,  5.22it/s]


found: 0.76

[34mHandling part 160...[0m


100%|██████████| 283/283 [00:53<00:00,  5.29it/s]


found: 0.76

[34mHandling part 161...[0m


100%|██████████| 321/321 [01:06<00:00,  4.82it/s]


found: 0.76

[34mHandling part 162...[0m


100%|██████████| 316/316 [01:05<00:00,  4.83it/s]


found: 0.77

[34mHandling part 163...[0m


100%|██████████| 313/313 [01:04<00:00,  4.82it/s]


found: 0.77

[34mHandling part 164...[0m


100%|██████████| 353/353 [01:18<00:00,  4.51it/s]


found: 0.78

[34mHandling part 165...[0m


100%|██████████| 326/326 [01:07<00:00,  4.82it/s]


found: 0.78

[34mHandling part 166...[0m


100%|██████████| 349/349 [01:11<00:00,  4.90it/s]


found: 0.79

[34mHandling part 167...[0m


100%|██████████| 320/320 [01:06<00:00,  4.84it/s]


found: 0.79

[34mHandling part 168...[0m


100%|██████████| 330/330 [01:05<00:00,  5.07it/s]


found: 0.80

[34mHandling part 169...[0m


100%|██████████| 337/337 [01:04<00:00,  5.20it/s]


found: 0.80

[34mHandling part 170...[0m


100%|██████████| 337/337 [01:04<00:00,  5.25it/s]


found: 0.81

[34mHandling part 171...[0m


100%|██████████| 303/303 [01:05<00:00,  4.63it/s]


found: 0.81

[34mHandling part 172...[0m


100%|██████████| 330/330 [01:09<00:00,  4.74it/s]


found: 0.82

[34mHandling part 173...[0m


100%|██████████| 331/331 [01:08<00:00,  4.84it/s]


found: 0.82

[34mHandling part 174...[0m


100%|██████████| 313/313 [01:05<00:00,  4.81it/s]


found: 0.83

[34mHandling part 175...[0m


100%|██████████| 333/333 [01:05<00:00,  5.07it/s]


found: 0.83

[34mHandling part 176...[0m


100%|██████████| 334/334 [01:02<00:00,  5.36it/s]


found: 0.84

[34mHandling part 177...[0m


100%|██████████| 322/322 [01:06<00:00,  4.85it/s]


found: 0.84

[34mHandling part 178...[0m


100%|██████████| 338/338 [01:21<00:00,  4.16it/s]


found: 0.84

[34mHandling part 179...[0m


100%|██████████| 311/311 [01:06<00:00,  4.65it/s]


found: 0.85

[34mHandling part 180...[0m


100%|██████████| 346/346 [01:11<00:00,  4.86it/s]


found: 0.85

[34mHandling part 181...[0m


100%|██████████| 326/326 [01:06<00:00,  4.88it/s]


found: 0.86

[34mHandling part 182...[0m


100%|██████████| 315/315 [01:03<00:00,  4.95it/s]


found: 0.86

[34mHandling part 183...[0m


100%|██████████| 301/301 [00:55<00:00,  5.43it/s]


found: 0.87

[34mHandling part 184...[0m


100%|██████████| 321/321 [01:06<00:00,  4.80it/s]


found: 0.87

[34mHandling part 185...[0m


100%|██████████| 345/345 [01:11<00:00,  4.83it/s]


found: 0.88

[34mHandling part 186...[0m


100%|██████████| 330/330 [01:08<00:00,  4.85it/s]


found: 0.88

[34mHandling part 187...[0m


100%|██████████| 326/326 [01:07<00:00,  4.80it/s]


found: 0.89

[34mHandling part 188...[0m


100%|██████████| 342/342 [01:07<00:00,  5.08it/s]


found: 0.89

[34mHandling part 189...[0m


100%|██████████| 337/337 [01:02<00:00,  5.36it/s]


found: 0.90

[34mHandling part 190...[0m


100%|██████████| 313/313 [01:05<00:00,  4.77it/s]


found: 0.90

[34mHandling part 191...[0m


100%|██████████| 308/308 [01:32<00:00,  3.32it/s]


found: 0.91

[34mHandling part 192...[0m


100%|██████████| 320/320 [01:04<00:00,  5.00it/s]


found: 0.91

[34mHandling part 193...[0m


100%|██████████| 311/311 [01:05<00:00,  4.76it/s]


found: 0.91

[34mHandling part 194...[0m


100%|██████████| 335/335 [01:02<00:00,  5.36it/s]


found: 0.92

[34mHandling part 195...[0m


100%|██████████| 331/331 [01:09<00:00,  4.79it/s]


found: 0.92

[34mHandling part 196...[0m


100%|██████████| 352/352 [01:06<00:00,  5.33it/s]


found: 0.93

[34mHandling part 197...[0m


100%|██████████| 345/345 [01:11<00:00,  4.80it/s]


found: 0.93

[34mHandling part 198...[0m


100%|██████████| 344/344 [01:03<00:00,  5.39it/s]


found: 0.94

[34mHandling part 199...[0m


100%|██████████| 326/326 [01:07<00:00,  4.84it/s]


found: 0.94

CPU times: user 8h 11min 22s, sys: 20min 44s, total: 8h 32min 7s
Wall time: 7h 14min 31s


In [7]:
# Inspect the assembled frame: image_id followed by embedding columns 0..511.
embeddings

Unnamed: 0,image_id,0,1,2,3,4,5,6,7,8,...,502,503,504,505,506,507,508,509,510,511
0,0,0.040063,0.570188,-0.205856,-0.207147,-0.685156,0.875572,0.195525,0.263069,-0.065040,...,-0.054556,-0.408539,-0.021889,-0.510256,0.099593,0.378148,-0.282832,0.130747,-0.129671,0.143439
1,1,-0.231017,0.176313,0.341615,-0.343364,0.000140,0.071841,-0.101578,-0.705418,0.642879,...,0.357738,0.593474,0.244515,-0.143392,-0.100900,-0.164380,-0.514502,1.008251,-0.200839,-0.380208
2,2,-0.167542,0.041370,-0.097727,0.040258,-0.048385,-0.409488,-0.218555,-0.256703,-0.324521,...,-0.294237,0.239773,-0.184432,0.148067,0.111601,0.057123,-0.116048,0.622953,0.016619,-0.206443
3,3,-0.185815,0.233224,-0.220207,-0.164292,0.393449,0.338707,0.069729,-0.118172,-0.120118,...,-0.227938,0.228972,0.464048,-0.394989,0.091809,-0.206579,0.053853,0.171857,0.299081,0.341985
4,4,-0.070075,0.404768,-0.008263,0.457253,-0.185769,0.590136,0.592431,-0.248257,-0.056639,...,0.061216,0.476642,0.037116,0.096425,-0.220578,0.316117,-0.340564,1.100186,-0.184208,0.103563
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69475,69475,-0.439497,0.024630,0.029694,0.313401,0.049096,0.196326,0.087322,0.011149,-0.195162,...,-0.122056,0.569884,0.877554,0.347739,-0.220037,-0.376298,-0.532710,0.017886,0.272139,0.416286
69476,69476,-0.336727,0.105042,-0.014149,-0.100525,-0.060500,-0.326088,0.016037,0.555359,0.188496,...,-0.092925,-0.141165,-0.001366,0.181614,0.395988,-0.218431,-0.261715,0.478720,-0.122022,-0.443649
69477,69477,-0.363973,0.260711,-0.059261,0.522644,0.079741,-0.041364,0.136542,-0.057749,0.121418,...,-0.130326,0.152554,0.655627,0.368892,-0.085061,-0.249023,-0.213025,0.777652,0.194381,0.108485
69478,69478,-0.068141,0.252226,-0.018986,0.265419,-0.407583,-0.146289,0.364292,-0.012511,-0.348367,...,-0.235141,0.199773,0.046873,0.079650,0.300423,-0.456441,-0.259283,0.432832,0.052929,0.273239


In [8]:
# Rename first: rename() silently ignores labels that are absent, so this cell can be
# re-run safely (the previous in-place version raised KeyError on 'width' the second time).
sizes = sizes.rename(columns={'width': 'pixel_width', 'height': 'pixel_height'})

# Aspect ratio; the small epsilon keeps the division finite for a zero height.
sizes['pixel_size_ratio'] = sizes['pixel_width'] / (sizes['pixel_height'] + 1e-3)
# Log-scaled pixel count.
sizes['pixel_area'] = np.log1p(sizes['pixel_width'] * sizes['pixel_height'])

sizes

Unnamed: 0,image_id,pixel_width,pixel_height,pixel_size_ratio,pixel_area
0,0,300.0,200.0,1.499993,11.002117
1,1,300.0,198.0,1.515144,10.992066
2,2,300.0,204.0,1.470581,11.021919
3,3,300.0,397.0,0.755666,11.687727
4,4,300.0,225.0,1.333327,11.119898
...,...,...,...,...,...
69475,69475,300.0,235.0,1.276590,11.163382
69476,69476,300.0,235.0,1.276590,11.163382
69477,69477,300.0,151.0,1.986742,10.721084
69478,69478,300.0,209.0,1.435400,11.046133


In [9]:
# Show only the images whose width is known (rows without one are dropped from the view).
sizes.dropna(subset=['pixel_width'])

Unnamed: 0,image_id,pixel_width,pixel_height,pixel_size_ratio,pixel_area
0,0,300.0,200.0,1.499993,11.002117
1,1,300.0,198.0,1.515144,10.992066
2,2,300.0,204.0,1.470581,11.021919
3,3,300.0,397.0,0.755666,11.687727
4,4,300.0,225.0,1.333327,11.119898
...,...,...,...,...,...
69475,69475,300.0,235.0,1.276590,11.163382
69476,69476,300.0,235.0,1.276590,11.163382
69477,69477,300.0,151.0,1.986742,10.721084
69478,69478,300.0,209.0,1.435400,11.046133


In [10]:
# Persist the bare embedding matrix (columns 0..511, without image_id) as a pickle,
# and the per-image size features as a CSV without the index column.
with open('image_embeddings.pickle', 'wb') as file:
    cloudpickle.dump(embeddings.loc[:, list(range(512))].to_numpy(), file)

sizes.to_csv(path_or_buf='sizes.csv', index=False)

In [11]:
# Remove the scratch file left behind by prepare_train_embeddings. Done in pure Python
# so it works on any platform and does nothing when the file is already gone.
if os.path.exists('pixels.csv'):
    os.remove('pixels.csv')