<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Baseline-(only-phash)" data-toc-modified-id="Baseline-(only-phash)-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Baseline (only phash)</a></span></li><li><span><a href="#Image-embeddings-with-our-model" data-toc-modified-id="Image-embeddings-with-our-model-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Image embeddings with our model</a></span></li><li><span><a href="#Text" data-toc-modified-id="Text-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Text</a></span></li><li><span><a href="#Phash" data-toc-modified-id="Phash-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Phash</a></span></li><li><span><a href="#Efficient-net" data-toc-modified-id="Efficient-net-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Efficient net</a></span></li></ul></div>

In [1]:
import numpy as np, pandas as pd, gc
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from functools import partial
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0
import cv2

## Baseline (only phash)

In [2]:
train = pd.read_csv('data/train.csv')

In [3]:
tmp = train.groupby('label_group').posting_id.agg('unique').to_dict()
train['target'] = train.label_group.map(tmp)

In [4]:
mp = train.groupby('image_phash').posting_id.agg('unique').to_dict()
train['oof'] = train.image_phash.map(mp)

In [5]:
def getMetric(col):
    def f1score(row):
        n = len( np.intersect1d(row.target,row[col]) )
        return 2*n / (len(row.target)+len(row[col]))
    return f1score


In [6]:
def combine_for_sub(row):
    x = np.concatenate([row.preds,row.preds2, row.preds3])
    return ' '.join( np.unique(x) )

def combine_for_cv(cols, row ):
    x = np.concatenate([row[c] for c in cols])
    return np.unique(x)

def test_preds(col_preds=['preds_image', 'preds_text', 'preds_phash']) :
    test['oof'] = test.apply(partial(combine_for_cv, col_preds),axis=1)
    test['f1'] = test.apply(getMetric('oof'),axis=1)
    print('CV Score =', test.f1.mean() )

In [7]:
train['f1'] = train.apply(getMetric('oof'),axis=1)
print('CV score for baseline =',train.f1.mean())

CV score for baseline = 0.5530933399167943


In [8]:
test = pd.read_csv('data/train.csv')
#test = cudf.DataFrame(test)
print('Using train as test to compute CV (since commit notebook). Shape is', test.shape )

Using train as test to compute CV (since commit notebook). Shape is (34250, 5)


In [9]:
tmp = test.groupby('label_group').posting_id.agg('unique').to_dict()
test['target'] = test.label_group.map(tmp)

## Image embeddings with our model

In [10]:
class DataGenerator(tf.keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, df, img_size=600, batch_size=32, path='', resize=True): 
        self.df = df
        self.img_size = img_size
        self.batch_size = batch_size
        self.path = path
        self.indexes = np.arange( len(self.df) )
        self.resize = resize
        
    def __len__(self):
        'Denotes the number of batches per epoch'
        ct = len(self.df) // self.batch_size
        ct += int(( (len(self.df)) % self.batch_size)!=0)
        return ct

    def __getitem__(self, index):
        'Generate one batch of data'
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        X = self.__data_generation(indexes)
        return X
            
    def __data_generation(self, indexes):
        'Generates data containing batch_size samples' 
        X = np.zeros((len(indexes),self.img_size,self.img_size,3),dtype='float32')
        df = self.df.iloc[indexes]
        for i,(index,row) in enumerate(df.iterrows()):
            img = cv2.imread(self.path+row.image)
            if self.resize : X[i,] = cv2.resize(img,(self.img_size,self.img_size)) #/128.0 - 1.0
        return X

In [11]:
BASE = 'data/train_images/'

WGT = '../input/eef-weights/effb0.h5'
model = EfficientNetB0(weights='imagenet',include_top=False, pooling='avg', input_shape=None)

embeds = []
CHUNK = 1024*4

print('Computing image embeddings...')
CTS = len(test)//CHUNK
if len(test)%CHUNK!=0: CTS += 1
for i,j in enumerate( range( CTS ) ):
    
    a = j*CHUNK
    b = (j+1)*CHUNK
    b = min(b,len(test))
    print('chunk',a,'to',b)
    
    test_gen = DataGenerator(test.iloc[a:b], batch_size=32, path=BASE)
    image_embeddings = model.predict(test_gen,verbose=1,use_multiprocessing=True, workers=1)
    embeds.append(image_embeddings)

    #if i>=1: break
    
image_embeddings = np.concatenate(embeds)
print('image embeddings shape',image_embeddings.shape)


Computing image embeddings...
chunk 0 to 4096
  3/128 [..............................] - ETA: 11:00

Process Keras_worker_ForkPoolWorker-1:
Traceback (most recent call last):
  File "/home/louis/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/louis/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/louis/anaconda3/lib/python3.7/multiprocessing/pool.py", line 127, in worker
    put((job, i, result))
  File "/home/louis/anaconda3/lib/python3.7/multiprocessing/queues.py", line 364, in put
    self._writer.send_bytes(obj)
  File "/home/louis/anaconda3/lib/python3.7/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/home/louis/anaconda3/lib/python3.7/multiprocessing/connection.py", line 398, in _send_bytes
    self._send(buf)
  File "/home/louis/anaconda3/lib/python3.7/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
KeyboardInterrupt
Process Keras_worker_ForkPoo

KeyboardInterrupt: 

In [28]:
torch.save(image_embeddings, 'data/effnet_embs/tf_embs_b0_256.pth')

In [None]:
#model.save_weights('effb0.h5', save_format='h5')

In [27]:
KNN = 50
if len(test)==3: KNN = 2
model = NearestNeighbors(n_neighbors=KNN)
model.fit(image_embeddings)

NearestNeighbors(n_neighbors=50, verbose=4, handle=<cuml.raft.common.handle.Handle object at 0x7f14c75bffb0>, algorithm='brute', metric='euclidean', p=2, algo_params=None, metric_params=None, output_type='input')

In [42]:
preds = []
CHUNK = 1024*4

print('Finding similar images...')
CTS = len(image_embeddings)//CHUNK
if len(image_embeddings)%CHUNK!=0: CTS += 1
for j in range( CTS ):
    
    a = j*CHUNK
    b = (j+1)*CHUNK
    b = min(b,len(image_embeddings))
    print('chunk',a,'to',b)
    distances, indices = model.kneighbors(image_embeddings[a:b,])
    
    for k in range(b-a):
        IDX = np.where(distances[k,]<0.3)[0]
        IDS = indices[k,IDX]
        o = test.iloc[IDS].posting_id.values
        preds.append(o)

Finding similar images...
chunk 0 to 4096
chunk 4096 to 8192
chunk 8192 to 12288
chunk 12288 to 16384
chunk 16384 to 20480
chunk 20480 to 24576
chunk 24576 to 28672
chunk 28672 to 32768
chunk 32768 to 34250


In [43]:
test['preds_image'] = preds
test.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group,preds_image,target,oof,f1
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794,[train_129225211],"[train_129225211, train_2278313361]","[train_129225211, train_2127235708]",0.5
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045,"[train_3386243561, train_3423213080]","[train_3386243561, train_3423213080]","[train_1816968361, train_2120597446, train_338...",0.666667
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,[train_2288590299],"[train_2288590299, train_3803689425]",[train_2288590299],0.666667
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188,"[train_2406599165, train_1593362411, train_147...","[train_2406599165, train_3342059966]","[train_1114961734, train_1254601165, train_147...",0.057143
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069,[train_3369186413],"[train_3369186413, train_921438619]","[train_3369186413, train_921438619]",1.0


In [44]:
test_preds(col_preds=['preds_image'])

CV Score = 0.6423777857935921


## Text

In [45]:
print('Computing text embeddings...')
model = TfidfVectorizer(stop_words='english', binary=True, max_features=25_000)
text_embeddings = model.fit_transform(test_gf.title).toarray()
print('text embeddings shape',text_embeddings.shape)

Computing text embeddings...
text embeddings shape (34250, 24939)


In [46]:
preds = []
CHUNK = 1024*4

print('Finding similar titles...')
CTS = len(test)//CHUNK
if len(test)%CHUNK!=0: CTS += 1
for j in range( CTS ):
    
    a = j*CHUNK
    b = (j+1)*CHUNK
    b = min(b,len(test))
    print('chunk',a,'to',b)
    
    # COSINE SIMILARITY DISTANCE
    cts = cupy.matmul( text_embeddings, text_embeddings[a:b].T).T
    
    for k in range(b-a):
        IDX = cupy.where(cts[k,]>0.7)[0]
        o = test.iloc[cupy.asnumpy(IDX)].posting_id.values
        preds.append(o)
        
del model, text_embeddings
_ = gc.collect()

Finding similar titles...
chunk 0 to 4096
chunk 4096 to 8192
chunk 8192 to 12288
chunk 12288 to 16384
chunk 16384 to 20480
chunk 20480 to 24576
chunk 24576 to 28672
chunk 28672 to 32768
chunk 32768 to 34250


In [47]:
test['preds_text'] = preds
test.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group,preds_image,target,oof,f1,preds_text
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794,[train_129225211],"[train_129225211, train_2278313361]",[train_129225211],0.666667,"[train_129225211, train_2278313361]"
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045,"[train_3386243561, train_3423213080]","[train_3386243561, train_3423213080]","[train_3386243561, train_3423213080]",1.0,[train_3386243561]
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,[train_2288590299],"[train_2288590299, train_3803689425]",[train_2288590299],0.666667,[train_2288590299]
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188,"[train_2406599165, train_1593362411, train_147...","[train_2406599165, train_3342059966]","[train_1470643555, train_1593362411, train_240...",0.25,"[train_2406599165, train_3576714541, train_150..."
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069,[train_3369186413],"[train_3369186413, train_921438619]",[train_3369186413],0.666667,[train_3369186413]


In [48]:
test_preds(col_preds=['preds_text'])

CV Score = 0.6139718474362906


In [49]:
test_preds(col_preds=['preds_text', 'preds_image'])

CV Score = 0.7059193217217656


## Phash

In [50]:
tmp = test.groupby('image_phash').posting_id.agg('unique').to_dict()
test['preds_phash'] = test.image_phash.map(tmp)
test.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group,preds_image,target,oof,f1,preds_text,preds_phash
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794,[train_129225211],"[train_129225211, train_2278313361]","[train_129225211, train_2278313361]",1.0,"[train_129225211, train_2278313361]",[train_129225211]
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045,"[train_3386243561, train_3423213080]","[train_3386243561, train_3423213080]","[train_3386243561, train_3423213080]",1.0,[train_3386243561],[train_3386243561]
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,[train_2288590299],"[train_2288590299, train_3803689425]",[train_2288590299],0.666667,[train_2288590299],[train_2288590299]
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188,"[train_2406599165, train_1593362411, train_147...","[train_2406599165, train_3342059966]","[train_1470643555, train_1508100548, train_159...",0.181818,"[train_2406599165, train_3576714541, train_150...",[train_2406599165]
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069,[train_3369186413],"[train_3369186413, train_921438619]",[train_3369186413],0.666667,[train_3369186413],[train_3369186413]


In [51]:
test_preds(col_preds=['preds_phash'])

CV Score = 0.5530933399168149


In [52]:
test_preds()

CV Score = 0.7059235996000499


## Efficient net

In [59]:
class DataGenerator(tf.keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, df, img_size=256, batch_size=32, path=''): 
        self.df = df
        self.img_size = img_size
        self.batch_size = batch_size
        self.path = path
        self.indexes = np.arange( len(self.df) )
        
    def __len__(self):
        'Denotes the number of batches per epoch'
        ct = len(self.df) // self.batch_size
        ct += int(( (len(self.df)) % self.batch_size)!=0)
        return ct

    def __getitem__(self, index):
        'Generate one batch of data'
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        X = self.__data_generation(indexes)
        return X
            
    def __data_generation(self, indexes):
        'Generates data containing batch_size samples' 
        X = np.zeros((len(indexes),self.img_size,self.img_size,3),dtype='float32')
        df = self.df.iloc[indexes]
        for i,(index,row) in enumerate(df.iterrows()):
            img = cv2.imread(self.path+row.image)
            X[i,] = cv2.resize(img,(self.img_size,self.img_size)) #/128.0 - 1.0
        return X

In [69]:
model = EfficientNetB0(weights='imagenet',include_top=False, pooling='avg', input_shape=None)

embeds = []
CHUNK = 1024*4

print('Computing image embeddings...')
CTS = len(test)//CHUNK
if len(test)%CHUNK!=0: CTS += 1
for i,j in enumerate( range( CTS ) ):
    
    a = j*CHUNK
    b = (j+1)*CHUNK
    b = min(b,len(test))
    print('chunk',a,'to',b)
    
    test_gen = DataGenerator(test.iloc[a:b], batch_size=32, path='data/train_images/')
    image_embeddings = model.predict(test_gen,verbose=1,use_multiprocessing=True, workers=4)
    embeds.append(image_embeddings)

    #if i>=1: break
    
del model
_ = gc.collect()
image_embeddings = np.concatenate(embeds)
print('image embeddings shape',image_embeddings.shape)

Computing image embeddings...
chunk 0 to 4096
chunk 4096 to 8192
chunk 8192 to 12288
chunk 12288 to 16384
 13/128 [==>...........................] - ETA: 1:39

Process Keras_worker_ForkPoolWorker-28:
Process Keras_worker_ForkPoolWorker-27:
Process Keras_worker_ForkPoolWorker-25:
Process Keras_worker_ForkPoolWorker-26:
Traceback (most recent call last):


In [68]:
%debug


> [0;32m<ipython-input-59-d01fd3b36be9>[0m(28)[0;36m__data_generation[0;34m()[0m
[0;32m     25 [0;31m        [0mdf[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mdf[0m[0;34m.[0m[0miloc[0m[0;34m[[0m[0mindexes[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     26 [0;31m        [0;32mfor[0m [0mi[0m[0;34m,[0m[0;34m([0m[0mindex[0m[0;34m,[0m[0mrow[0m[0;34m)[0m [0;32min[0m [0menumerate[0m[0;34m([0m[0mdf[0m[0;34m.[0m[0miterrows[0m[0;34m([0m[0;34m)[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     27 [0;31m            [0mimg[0m [0;34m=[0m [0mcv2[0m[0;34m.[0m[0mimread[0m[0;34m([0m[0mself[0m[0;34m.[0m[0mpath[0m[0;34m+[0m[0mrow[0m[0;34m.[0m[0mimage[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 28 [0;31m            [0mX[0m[0;34m[[0m[0mi[0m[0;34m,[0m[0;34m][0m [0;34m=[0m [0mcv2[0m[0;34m.[0m[0mresize[0m[0;34m([0m[0mimg[0m[0;34m,[0m[0;34m([0m[0mself[0m[0;34m.[0m