In [3]:
import pandas as pd
import random
import os
import numpy as np
import pickle

# load word vecs

### fastText

In [4]:
# Location of the fastText binary: `data_path` is the directory, `fname` the
# file inside it (the two are joined when the model is loaded).
# NOTE(review): absolute, machine-specific Windows path -- adjust before
# running this notebook on another machine.
data_path = r"C:\Users\alexa\Documents\data\wir\project"
fname = r"cc.en.150.bin"

In [5]:
%%time
import fasttext
# Load the fastText model stored at <data_path>/<fname>; `emb` is the handle
# handed to the word-vector helpers further down.
emb = fasttext.load_model(os.path.join(data_path, fname))

Wall time: 18.6 s


In [6]:
from imgsearch import cosine_sim
import re

def getwv(s, model):
    """Return ``model.get_word_vector`` for the lower-cased form of ``s``.

    Parameters
    ----------
    s : str
        Word to look up; it is lower-cased before the lookup.
    model
        Object exposing ``get_word_vector(word)`` (the fastText model
        loaded in this notebook).
    """
    lowered = s.lower()
    return model.get_word_vector(lowered)

def getwv_multi_labels(labels, model):
    """Average the word vectors of every word contained in ``labels``.

    ``labels`` is split on commas and spaces, so ``"seat belt, seatbelt"``
    contributes the three words ``seat``, ``belt`` and ``seatbelt``.

    Parameters
    ----------
    labels : str
        One or more words separated by commas and/or spaces.
    model
        Embedding object forwarded unchanged to ``getwv``.

    Returns
    -------
    The arithmetic mean of ``getwv(word, model)`` over all words.

    Notes
    -----
    Empty fragments produced by the split (leading/trailing separators,
    doubled commas, an empty string) are discarded so they no longer take
    part in the mean. If nothing remains, the vector of the empty string is
    returned, which is what empty input produced before.
    """
    sublabels = [tok for tok in re.split(r'[, ]\s*', labels) if tok]
    if not sublabels:
        # Keep the historical result for empty input: a single lookup of ''.
        sublabels = ['']
    return sum([getwv(subl, model) for subl in sublabels]) / len(sublabels)

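### GloVe (alternative to fastText: this section rebinds `emb`, `getwv` and `getwv_multi_labels`, so run only one of the two sections)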

In [90]:
%%time
import csv
# Read the GloVe text file into a DataFrame: space-separated, no header row,
# first column (the token) becomes the index. QUOTE_NONE makes the parser
# treat quote characters as ordinary text.
# Note: this rebinds `data_path` and `emb` used by the fastText section.
data_path = r"C:\Users\alexa\Documents\data\glove.6B"
glove_data_file = 'glove.6B.50d.txt'
emb = pd.read_table(
    os.path.join(data_path, glove_data_file),
    sep=" ",
    header=None,
    index_col=0,
    quoting=csv.QUOTE_NONE,
)

Wall time: 5.04 s


In [91]:
from imgsearch import cosine_sim
import re

# Fallback vector handed out when a word cannot be looked up: 50 entries of
# 1e-5 labelled 1..50 (presumably chosen to line up with the 50 value columns
# of the GloVe table -- confirm if another dimensionality is loaded).
exception_series = pd.Series([1e-5] * 50, index=list(range(1, 51)))

def getwv(s, emb):
    """Look up the embedding row of a single word in the GloVe table.

    Parameters
    ----------
    s : str
        Word to look up; it is lower-cased first.
    emb
        Object supporting label-based ``.loc[...]`` lookup (the DataFrame
        read from the GloVe file in this notebook).

    Returns
    -------
    ``emb.loc[s.lower()]`` when the word is present, otherwise the shared
    fallback ``exception_series``.
    """
    try:
        return emb.loc[s.lower()]
    except (KeyError, AttributeError):
        # KeyError: the word is not in the index.
        # AttributeError: `s` has no .lower() (a non-string label).
        # Anything else -- in particular KeyboardInterrupt during the long
        # embedding run -- is allowed to propagate instead of being swallowed.
        return exception_series

def getwv_multi_labels(labels, emb):
    """Average the word vectors of every word contained in ``labels``.

    ``labels`` is split on commas and spaces, so ``"seat belt, seatbelt"``
    contributes the three words ``seat``, ``belt`` and ``seatbelt``.

    Parameters
    ----------
    labels : str
        One or more words separated by commas and/or spaces.
    emb
        Embedding table forwarded unchanged to ``getwv``.

    Returns
    -------
    The arithmetic mean of ``getwv(word, emb)`` over all words.

    Notes
    -----
    Empty fragments produced by the split (leading/trailing separators,
    doubled commas, an empty string) are discarded so they no longer take
    part in the mean. If nothing remains, a single lookup of the empty
    string is performed, which is what empty input produced before.
    """
    sublabels = [tok for tok in re.split(r'[, ]\s*', labels) if tok]
    if not sublabels:
        # Keep the historical result for empty input: a single lookup of ''.
        sublabels = ['']
    return sum([getwv(subl, emb) for subl in sublabels]) / len(sublabels)

# build search routine

### load img embeddings

In [12]:
# Restore the image-embedding dictionary from its pickle file.
# NOTE: unpickling can execute arbitrary code -- only load files you produced
# yourself.
with open(r"static/img_emb_dict_150d_fasttext.pkl", "rb") as pkl_file:
    img_emb_dict = pickle.load(pkl_file)

In [13]:
def image_search_res5(query, img_emb_dict, emb, top_n=12):
    """Rank images by weighted cosine similarity to a text query.

    Parameters
    ----------
    query : str
        Free-text query; embedded with ``getwv_multi_labels(query, emb)``.
    img_emb_dict : dict
        Maps an image id to a list of ``(vector, weight)`` pairs.
    emb
        Word-embedding object forwarded to ``getwv_multi_labels``.
    top_n : int, optional
        Number of best matches to return. Defaults to 12, the value that
        used to be hard-coded.

    Returns
    -------
    list of (image_name, score)
        Best match first; an empty list when the query vector is all zeros.

    Notes
    -----
    Reads the notebook-level ``labelled_df`` (columns ``img_id`` and
    ``image_name``) to translate image ids into file names, so that frame
    must exist before this function is called. An id present in
    ``img_emb_dict`` but missing from ``labelled_df`` raises ``KeyError``.
    """
    # get query embedding representation
    q = getwv_multi_labels(query, emb)

    # An all-zero vector has no direction, so there is nothing to rank.
    # Check the components themselves: testing their sum would also reject
    # valid vectors whose positive and negative entries cancel out.
    if not np.any(q):
        return []

    # map image ids to names
    img_id_to_name = dict(zip(labelled_df.img_id, labelled_df.image_name))

    # score each image: weighted sum of similarities over its (vec, weight) pairs
    scored = [
        (
            img_id_to_name[img_id],
            sum([weight * cosine_sim(q, vec) for vec, weight in vec_weights]),
        )
        for img_id, vec_weights in img_emb_dict.items()
    ]
    ranked = sorted(scored, key=lambda tup: tup[1], reverse=True)

    # output list of (image name, score)
    return ranked[:top_n]

In [14]:
%%time
# Smoke test: run the search for a plural query; the returned ranked list is
# the cell's displayed value.
# NOTE(review): the recorded output consists only of t-shirt images whose
# scores differ by roughly 0.001 -- the ranking looks uninformative for this
# query and is worth investigating.
query = 'shoes'
image_search_res5(query, img_emb_dict, emb)

Wall time: 1.94 s


[('1359-11714-large_default-t-shirt-wonderlandes.jpg', 0.6324445790672139),
 ('1020-10242-large_default-t-shirt-la-belle-vie.jpg', 0.632433138300738),
 ('1020-8733-large_default-t-shirt-la-belle-vie.jpg', 0.6324279881868502),
 ('1359-13044-large_default-t-shirt-wonderlandes.jpg', 0.6324220065753512),
 ('1201-10244-large_default-t-shirt-oh-la-la.jpg', 0.6322406590341979),
 ('1593-13046-large_default-t-shirt-jeu-de-mains.jpg', 0.6322229791388215),
 ('1021-8792-large_default-t-shirt-mauvaise-reputation.jpg',
  0.6321523631058362),
 ('1021-8735-large_default-t-shirt-mauvaise-reputation.jpg',
  0.632150033519915),
 ('1360-11514-large_default-t-shirt-fluctuat-nec-mergitur.jpg',
  0.6318065426074744),
 ('1822-14889-large_default-t-shirt-serengueti.jpg', 0.6315953772175923),
 ('1019-10310-large_default-t-shirt-bear-and-sun.jpg', 0.6313840013274622),
 ('1592-13403-large_default-t-shirt-think-green.jpg', 0.6312716568081105)]

In [105]:
%%time
# Same check with the singular form of the query.
# NOTE(review): the recorded output puts a "maillot-de-bain" image first,
# followed by the same t-shirt images as the plural query -- still no
# shoe-related file names.
query = 'shoe'
image_search_res5(query, img_emb_dict, emb)

Wall time: 2.56 s


[('1337-11973-large_default-maillot-de-bain-apnee.jpg', 0.6898951391501433),
 ('1359-13044-large_default-t-shirt-wonderlandes.jpg', 0.6632117649964778),
 ('1359-11714-large_default-t-shirt-wonderlandes.jpg', 0.6632077507946511),
 ('1020-10242-large_default-t-shirt-la-belle-vie.jpg', 0.6631779094229434),
 ('1020-8733-large_default-t-shirt-la-belle-vie.jpg', 0.6631121647741495),
 ('1593-13046-large_default-t-shirt-jeu-de-mains.jpg', 0.6627354604149222),
 ('1021-8792-large_default-t-shirt-mauvaise-reputation.jpg',
  0.6625222204742057),
 ('1201-10244-large_default-t-shirt-oh-la-la.jpg', 0.6624246590254446),
 ('1360-11514-large_default-t-shirt-fluctuat-nec-mergitur.jpg',
  0.6624216284902036),
 ('1021-8735-large_default-t-shirt-mauvaise-reputation.jpg',
  0.6621673283056734),
 ('1822-14889-large_default-t-shirt-serengueti.jpg', 0.6618654047730939),
 ('1592-13403-large_default-t-shirt-think-green.jpg', 0.661746119532137)]

### query images and save output in static folder

In [65]:
# NOTE(review): 'bicyle' looks like a typo for 'bicycle'; it is kept because
# the recorded output was produced with this exact string.
query = 'bicyle'
# The embedding object is bound to `emb` everywhere else in this notebook;
# the name `model` used here before is not defined in any visible cell and
# fails with NameError on a fresh kernel.
image_list = image_search_res5(query, img_emb_dict, emb)
# Saving the result is currently disabled:
#with open(r"static/image_list.pkl", "wb") as f:
#    pickle.dump(image_list, f)
image_list

[('965-8361-large_default-un-rendez-vous-chez-le-barbier.jpg',
  0.5948124769326235),
 ('1797-14438-large_default-casque-de-velo-urbain.jpg', 0.49601409581487826),
 ('1797-14434-large_default-casque-de-velo-urbain.jpg', 0.41289129330575647),
 ('1797-14411-large_default-casque-de-velo-urbain.jpg', 0.32578944563839196),
 ('1607-13245-large_default-gants-en-cuir-double-cachemire-by-dents.jpg',
  0.325553129840431),
 ('626-5109-large_default-adopter-vos-pieds-de-vigne.jpg', 0.3245709244848521),
 ('1797-14421-large_default-casque-de-velo-urbain.jpg', 0.324302362326393),
 ('1797-14413-large_default-casque-de-velo-urbain.jpg', 0.32354706392542115),
 ('1797-14418-large_default-casque-de-velo-urbain.jpg', 0.32161969861347534),
 ('1797-14412-large_default-casque-de-velo-urbain.jpg', 0.31081113567197055),
 ('1621-13437-large_default-organisateur-high-tech-bellroy.jpg',
  0.30456713148728515),
 ('965-8359-large_default-un-rendez-vous-chez-le-barbier.jpg',
  0.2930161408907653)]

### inspect single image and assess relevance with query

In [117]:
def inspect_img_vs_query(query, img_id, emb, n_labels=3):
    """Print and return the per-label score breakdown of one image for a query.

    Parameters
    ----------
    query : str
        Text query, embedded with ``getwv_multi_labels(query, emb)``.
    img_id
        Image id; it is formatted to a string both for the ``labelled_df``
        lookup and for the ``img_emb_dict`` key.
    emb
        Word-embedding object forwarded to ``getwv_multi_labels``.
    n_labels : int, optional
        How many leading ``(vector, weight)`` pairs / labels to inspect.
        Defaults to 3, the value that used to be hard-coded.

    Returns
    -------
    list of (weight * similarity, weight, similarity)
        One tuple per inspected label, in stored order.

    Notes
    -----
    Reads the notebook-level ``labelled_df`` and ``img_emb_dict``. Prints the
    image name, the summed weighted score and the first ``n_labels`` entries
    of the image's ``content``. Raises ``IndexError`` if ``img_id`` matches no
    row of ``labelled_df``.
    """
    # Single row lookup, reused for both the name and the label content.
    row = labelled_df.query("img_id == '{}'".format(img_id))
    print(row.image_name.values[0])

    # Embed the query once -- it is identical for every label of the image.
    q = getwv_multi_labels(query, emb)
    scores = []
    for vec, weight in img_emb_dict['{}'.format(img_id)][:n_labels]:
        sim = cosine_sim(q, vec)
        scores.append((weight * sim, weight, sim))

    print(sum([score[0] for score in scores]))
    print(row.loc[:, 'content'].values[0][:n_labels])
    return scores

In [523]:
# inspect_img_vs_query takes (query, img_id, emb); the embedding argument was
# missing, which raises TypeError with the definition in this notebook.
inspect_img_vs_query('bicyle', 13437, emb)

1621-13437-large_default-organisateur-high-tech-bellroy.jpg
0.29762976293333027
((0.9043031930923462, 'seat belt, seatbelt'), (0.03341376408934593, 'scabbard'), (0.018360324203968048, 'buckle'))


[(0.2875137230397229, 0.9043031930923462, 0.31793952),
 (0.0065252400276893385, 0.03341376408934593, 0.19528599),
 (0.0035907998659180373, 0.018360324203968048, 0.19557388)]

# generate img embeddings

In [7]:
# Each helper receives `tup`, a sequence of (weight, label) pairs, and reads
# the notebook-level `emb` at call time.

# img emb = weighted sum of vec
def get_weighted_sum_vec(tup):
    """Return the sum over pairs of ``getwv(label, emb) * weight``."""
    return sum(getwv(label, emb) * weight for weight, label in tup)


def get_weighted_sum_vec_multilbl(tup):
    """Return the sum over pairs of ``getwv_multi_labels(label, emb) * weight``."""
    return sum(getwv_multi_labels(label, emb) * weight for weight, label in tup)


# img emb = [(vec, weight)]
def get_tup_vec_weigth(tup):
    """Return ``[(getwv(label, emb), weight), ...]`` in input order."""
    pairs = []
    for weight, label in tup:
        pairs.append((getwv(label, emb), weight))
    return pairs


def get_tup_vec_weight_multilbl(tup):
    """Return ``[(getwv_multi_labels(label, emb), weight), ...]`` in input order."""
    pairs = []
    for weight, label in tup:
        pairs.append((getwv_multi_labels(label, emb), weight))
    return pairs


# img emb = vec of first label
def get_vec_first_label(tup):
    """Return the multi-label vector of the first pair's label only."""
    _weight, first_label = tup[0]
    return sum([getwv_multi_labels(first_label, emb)])

In [8]:
# Read the per-image label table from the static folder into `labelled_df`.
path = 'static/img_top10_labels_w_weights.csv'
labelled_df = pd.read_csv(path)

In [9]:
from ast import literal_eval
# The `content` column comes back from the CSV as text; parse each cell into
# the Python literal it spells out. Only strings are parsed, so re-running
# this cell on an already-converted frame is a no-op instead of an error
# (literal_eval rejects values that are not strings or AST nodes).
labelled_df['content'] = labelled_df['content'].apply(
    lambda cell: literal_eval(cell) if isinstance(cell, str) else cell
)

In [10]:
%%time
# Build {img_id: [(label_vector, weight), ...]} for every row of labelled_df.
# Recorded timings: about 20 minutes w/ GloVe, about 3s w/ fastText.
# Alternative image representations (swap the helper passed to .apply):
#   get_weighted_sum_vec_multilbl -> img emb = weighted sum of vec
#   get_vec_first_label           -> img emb = vec of first label
img_emb_dict = dict(zip(
    labelled_df.img_id,
    labelled_df.content.apply(get_tup_vec_weight_multilbl),  # img emb = [(vec, weight)]
))

Wall time: 2.03 s


In [11]:
# Persist the image-embedding dictionary so the search section can reload it
# without recomputing.
with open(r"static/img_emb_dict_150d_fasttext.pkl", "wb") as pkl_file:
    pickle.dump(img_emb_dict, pkl_file)

# add weights to image labels

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from imgsearch import cosine_sim
import numpy as np

In [9]:
from ast import literal_eval

# The mapping file is parsed as a Python literal (presumably a dict of
# class id -> label text, since `labels` is later passed to Series.map --
# confirm against the file). literal_eval only accepts literals, so unlike
# eval() nothing in the file can be executed as code.
with open('static/imagenet_classes_to_labels_map.txt') as f:
    labels = literal_eval(f.read())

In [10]:
def get_topn_labels(row, n=10):
    """Return the ``n`` largest entries of ``row`` as a labelled table.

    Parameters
    ----------
    row : pandas.Series
        Values convertible to float; the index entries must be convertible
        to int because they become the ``label_id`` column.
    n : int, optional
        Number of top entries to keep (default 10).

    Returns
    -------
    pandas.DataFrame
        Columns ``label_id`` (int), ``percentage`` (float) and ``label_name``
        (``label_id`` mapped through the notebook-level ``labels``), ordered
        from largest to smallest value.
    """
    top = row.astype(float).nlargest(n).reset_index()
    top.columns = ['label_id', 'percentage']
    label_ids = top['label_id'].astype(int)
    return top.assign(label_id=label_ids, label_name=label_ids.map(labels))

In [11]:
%%time

# Build `labelled_df`: the first three columns of the weight table plus a
# `content` column holding, per image, a tuple of (percentage, label_name)
# pairs for its top labels.

# Upper index label used by the slices below. `.loc[:n_img]` is label-based
# and includes the end label, so rows 0..n_img are covered.
n_img = 10000

# get weight data
# NOTE(review): absolute, machine-specific path; also rebinds `data_path`,
# which earlier cells use for a directory rather than a file.
data_path = r"C:\Users\alexa\Documents\data\wir\project\img_proba_vecs_vgg19.csv"
dfw = pd.read_csv(data_path, low_memory=False)

# generate df with descr of top10 classes
# Keep the first three columns; the remaining columns are what gets ranked
# per row (presumably one probability per class -- confirm against the CSV).
labelled_df = dfw.loc[:, dfw.columns[:3]].copy()
labelled_df['content'] = ''

# `i` from enumerate is used as a row label, which matches the default
# 0..N-1 index produced by read_csv above; `image_name` itself is not used.
for i, image_name in enumerate(dfw.image_name.loc[:n_img]):
    topnlabels = get_topn_labels(dfw.loc[i, dfw.columns[3:]])
    labelled_df.loc[i, 'content'] = tuple(zip(topnlabels.percentage, topnlabels.label_name)) # keep weight info

# select top n images
# Rows beyond n_img never had `content` filled in, so they are dropped here.
labelled_df = labelled_df.loc[:n_img].copy()

Wall time: 1min 11s


In [12]:
# Target file for the image label table.
path='static/img_top10_labels_w_weights.csv'
    
# save image label data (export currently disabled; uncomment to write the CSV)
#labelled_df.to_csv(path, index=False)