In [1]:
import numpy as np
import re
import torch
from torch.utils.data import TensorDataset, DataLoader

from transformers import BertModel, BertTokenizer
from transformers import RobertaTokenizer, RobertaModel
from transformers import DistilBertTokenizer, DistilBertModel

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
import time
from sklearn.metrics import silhouette_score
import pickle

In [2]:
# Shell escape: print the GPUs visible to this kernel before any GPU work is started.
!nvidia-smi

In [3]:
# kmeanstf uses gpu for computation it's faster a bit
!pip install kmeanstf --quiet

In [4]:
# Read the raw inputs. Every text file holds one comma-separated record per line.
with open('../input/hyppr-images-mapping/categories.txt', 'r') as file:
    # lower-case, strip trailing whitespace, then split into the terms of the category
    categories = [line.lower().rstrip().split(',') for line in file]

with open('../input/hyppr-images-mapping/labels.txt', 'r') as file:
    # drop the trailing comma / newline before splitting into labels
    labels_all = [line.rstrip(',\n').split(',') for line in file]

with open('../input/hyppr-images-mapping/scores.txt', 'r') as file:
    # same layout as labels.txt; the values stay strings here
    scores = [line.rstrip(',\n').split(',') for line in file]

# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open("../input/hyppr-images-mapping/all_ids.p", "rb") as f:
    all_ids = pickle.load(f)

In [5]:
# Distinct labels over all records (a set is used, so the resulting order is arbitrary).
labels = list({lab for label in labels_all for lab in label})
# Primary name of every category: the first term of its line.
cats = [terms[0] for terms in categories]

In [6]:
# One string per record: join its labels with spaces, then re-split on spaces and hyphens
# so that hyphenated labels become separate words.
labels_n = [" ".join(re.split(' |-', " ".join(record_labels))) for record_labels in labels_all]

# One string per category: its terms joined with spaces.
categories_n = [" ".join(terms) for terms in categories]

In [7]:
# Hugging Face offers a ready-made `pipeline` for feature extraction, but it cannot cope with large collections of embeddings,
# since nothing inside it frees memory between inputs. Use a PyTorch DataLoader or manual GPU batching instead.

# from transformers import pipeline

# nlp_features = pipeline('feature-extraction')
# out = nlp_features(labels_n)

# out = np.array(out)

# out = out[:, 8, :]

# a, b = 123, 36
# print(labels[a], labels[b])
# cosine_similarity([out[a]], [out[b]])

# Models loading

In [9]:
# Alternative backbone (disabled):
# model = BertModel.from_pretrained('bert-base-uncased')
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# DistilBERT (uncased) tokenizer and encoder used by the category and label mapping cells.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

## Categories mapping

In [None]:
# r_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# r_model = RobertaModel.from_pretrained('roberta-base')

In [10]:
# The categories are encoded with one plain model(ids, mask) call on the CPU.
tokenized_categories = tokenizer(categories_n, padding = True, truncation = True, return_tensors = 'pt')

ids = tokenized_categories['input_ids']
mask = tokenized_categories['attention_mask']

model.cpu()
model.eval()  # inference only: keep dropout disabled

# no_grad: the autograd graph is never used, so do not build it
with torch.no_grad():
    output = model(ids, mask)
final_layer = output.last_hidden_state

cat_embeddings = final_layer.detach().numpy()

# NOTE(review): the hidden state at token position 7 is used as the text embedding.
# This fails when the padded length is below 8, and for short inputs position 7 may be
# a padding token - confirm this is intended rather than position 0 or a masked mean.
cat_embeddings = cat_embeddings[:, 7, :]

## Labels mapping

In [11]:
# Tokenize every label string; padding/truncation are enabled and PyTorch tensors are returned.
tokenized_labels = tokenizer(labels_n, padding = True, truncation = True, return_tensors = 'pt')

# The two tensors that are fed to the encoder in the next cell.
input_ids, masks = tokenized_labels['input_ids'], tokenized_labels['attention_mask']

In [12]:
# There are many label strings, so they are encoded in mini-batches.
batch_size = 16

train_data = TensorDataset(input_ids, masks)
#train_sampler = RandomSampler(train_data)
# no shuffling: row i of the result must stay aligned with labels_all[i] / all_ids[i]
train_dl = DataLoader(train_data, batch_size=batch_size)

# Use the GPU when one is available; fall back to the CPU instead of crashing.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()

out_list = []
with torch.no_grad():
    for batch_ids, batch_mask in train_dl:
        hidden = model(batch_ids.to(device), batch_mask.to(device))[0]
        # NOTE(review): token position 7 is used as the text embedding (same choice as for
        # the categories); for short inputs this may be a padding position - confirm.
        out_list.append(hidden[:, 7, :].cpu())

labels_embeddings = torch.cat(out_list)
labels_embeddings = labels_embeddings.numpy()

# RoBERTa model:

In [None]:


# Alternative encoder: repeat the category / label embedding steps with RoBERTa.
# NOTE(review): r_tokenizer and r_model are only created in a commented-out cell, so this cell
# raises NameError unless those two lines are re-enabled first.
# Running it overwrites cat_embeddings and labels_embeddings produced by the cells above.
tokenized_categories = r_tokenizer(categories_n, padding = True, truncation = True, return_tensors = 'pt')

ids = tokenized_categories['input_ids']
mask = tokenized_categories['attention_mask']   

r_model.cpu()
output = r_model(ids, mask)
final_layer = output.last_hidden_state

cat_embeddings = final_layer.detach().numpy()

# token position 8 is used here (the cells above use position 7)
cat_embeddings = cat_embeddings[:, 8, :]

# NOTE(review): unlike the call above, truncation is not enabled for the labels here
tokenized_labels = r_tokenizer(labels_n, return_tensors="pt", padding = True)

input_ids = tokenized_labels['input_ids']
masks = tokenized_labels['attention_mask']

# since there is a lot of sentences we need to batch it 
batch_size = 32

train_data = TensorDataset(input_ids, masks)
train_dl = DataLoader(train_data, batch_size=batch_size)

r_model.cuda()
out_list = []
r_model.eval()
with torch.no_grad():
    for d in train_dl:
        # each batch holds two tensors (ids, mask); the [:3] slice therefore keeps both
        d = [i.cuda() for i in d[:3]]
        out_list.append(r_model(*d)[0][:, 8, :].cpu()) 
    del d
    
labels_embeddings = torch.cat(out_list)
labels_embeddings = labels_embeddings.numpy() 
labels_embeddings.shape

# Normalizing the data

In [13]:
from sklearn.preprocessing import Normalizer
import umap
import matplotlib.pyplot as plt

tr_l_e = Normalizer().fit_transform(labels_embeddings)
tr_c_e = Normalizer().fit_transform(cat_embeddings)

reducer = umap.UMAP()
%time lab_emb_2d = reducer.fit_transform(tr_l_e)

plt.scatter(
    lab_emb_2d[:, 0],
    lab_emb_2d[:, 1]);

In [14]:
cosine_similarity([labels_embeddings[0]], [labels_embeddings[12]])

In [15]:
# From here on the names refer to the normalized vectors (tr_l_e / tr_c_e come from
# Normalizer; no dimensionality reduction has been applied to them).
labels_embeddings = tr_l_e.copy() # normalized label embeddings replace the raw ones
cat_embeddings = tr_c_e.copy()

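# Aggregating word embeddings into sentence embeddings
We will use a weighted arithmetic mean aggregation here:
 \begin{equation*} \LARGE
 			\text{result}_j = \frac{\sum_{i=1}^{m_l} v_{ij} \cdot s_i}{\sum_{i=1}^{m_l} v_{ij}}, \; \forall j \in \{1, \dots, n\}.
 \end{equation*}
 Here $v_{ij}$ is the $j$-th component of the embedding of the $i$-th label (an element of the main matrix of embeddings), $s_i$ is the weight that the Google API model assigned to the $i$-th label, $m_l$ is the number of labels being aggregated and $n$ is the embedding dimension. Note that the denominator sums the embedding components $v_{ij}$ rather than the weights $s_i$, so this differs from the textbook weighted mean $\sum_i v_{ij} s_i / \sum_i s_i$.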

In [23]:
def aggregating_words(labels, embeddings, generated_sentences, label_scores=None):
    """Aggregate per-label embeddings into one embedding per image.

    For every image, each label embedding is multiplied by the label's score and the
    products are summed; the sum is then divided component-wise by the plain sum of
    the same label embeddings (the formula of the markdown cell above).

    Labels that cannot be resolved (unknown label, missing or non-numeric score) are
    skipped and counted; the count is printed before returning.

    Parameters
    ----------
    labels : sequence
        Keys; ``labels[i]`` is associated with ``embeddings[i]``. Must not be empty.
    embeddings : sequence of 1-D numpy arrays (or a 2-D array)
        One vector per entry of ``labels``.
    generated_sentences : sequence of sequences
        For every image, the labels attached to it.
    label_scores : sequence of sequences, optional
        ``label_scores[image][k]`` is the weight of the k-th label of that image
        (anything ``float()`` accepts). Defaults to the notebook-level ``scores``.

    Returns
    -------
    numpy.ndarray
        One aggregated vector per image, in the order of ``generated_sentences``.
    """
    if label_scores is None:
        label_scores = scores  # notebook-level list read from scores.txt

    dict_embeddings = {label: embeddings[i] for i, label in enumerate(labels)}
    n = dict_embeddings[labels[0]].shape[0]

    images_embeddings = []  # one aggregated embedding per image
    outer_elements = 0      # labels that had to be skipped
    for image_ind, image_labels in enumerate(generated_sentences):
        result_embedding = np.zeros(n)
        norm = np.zeros(n)
        for label_ind, label in enumerate(image_labels):
            try:
                label_embedding = dict_embeddings[label]
                weight = float(label_scores[image_ind][label_ind])
            except (KeyError, IndexError, ValueError):
                # skip the label entirely instead of re-adding the previous label's
                # contribution, as the old code did
                outer_elements += 1
                continue
            result_embedding = result_embedding + label_embedding * weight  # vector * scalar
            norm = norm + label_embedding
        images_embeddings.append(np.divide(result_embedding, norm))

    # reported before returning (this line used to sit after `return` and never ran)
    print('Number of list index out of range elements: {}'.format(outer_elements))
    return np.array(images_embeddings)
    
# images_embeddings = aggregating_words(labels, labels_embeddings, labels_all)
# labels_embeddings = images_embeddings

# Dimensionality reduction

There are two well-known methods available: PCA and UMAP. t-SNE is mainly used for visualization and is slow, so it is out of consideration for now.
A short comparison of PCA and UMAP:

1.  UMAP is potentially better because it tends to pull nearest neighbours together, which acts as a kind of pre-clustering.
2.  UMAP is also better because it can find non-linear relationships in the data.

## Bringing PCA on

In [25]:
from sklearn.decomposition import PCA
# Linear baseline: project the label embeddings onto 150 principal components.
pca = PCA(n_components = 150)

# %time reports how long the fit + projection takes
%time pca_labels = pca.fit_transform(labels_embeddings)

In [26]:
import matplotlib.pyplot as plt

pca_2d = PCA(n_components = 2)
%time lab_emb_2d = pca_2d.fit_transform(labels_embeddings)

plt.scatter(
    lab_emb_2d[:, 0],
    lab_emb_2d[:, 1])

PCA does not handle this well: the points are spread almost uniformly over the square.

## Bringing UMAP on

In [16]:
import umap
import matplotlib.pyplot as plt

reducer = umap.UMAP()
%time lab_emb_2d = reducer.fit_transform(labels_embeddings)

lab_emb_2d.shape

plt.scatter(
    lab_emb_2d[:, 0],
    lab_emb_2d[:, 1])

In [17]:
def draw_umap(data = None, n_neighbors=15, min_dist=0.1, n_components=2, metric='cosine', title=''):
    """Fit UMAP with the given hyper-parameters, scatter-plot the result and save it as a PNG.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features), optional
        Vectors to embed. Defaults to the notebook-level ``labels_embeddings``, looked up
        when the function is called (not when it is defined).
    n_neighbors, min_dist, n_components, metric
        Passed straight to ``umap.UMAP``. Only 1, 2 or 3 components are plotted.
    title : str
        Figure title.

    The figure is saved as 'draw_umap_n_neighbors_<n>.png' when ``n_neighbors`` differs
    from 15, otherwise as 'draw_umap_min_dist_<d>.png' with every '.' of <d> replaced by '0'.
    The elapsed UMAP time is printed.
    """
    if data is None:
        data = labels_embeddings

    fit = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        metric=metric
    )

    start_time = time.time()
    u = fit.fit_transform(data)
    end_time = time.time()
    print('UMAP execution time in seconds: {}'.format(end_time - start_time))

    # A 2-D `c` is only valid for scatter as one RGB/RGBA row per point; for any other
    # input (e.g. an embedding matrix) fall back to the default colour instead of failing.
    colors = data if np.ndim(data) == 2 and np.shape(data)[1] in (3, 4) else None

    fig = plt.figure()
    if n_components == 1:
        ax = fig.add_subplot(111)
        ax.scatter(u[:,0], range(len(u)), c=colors)
    elif n_components == 2:
        ax = fig.add_subplot(111)
        ax.scatter(u[:,0], u[:,1])
    elif n_components == 3:
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(u[:,0], u[:,1], u[:,2], c=colors, s=100)
    plt.title(title, fontsize=18)
    # the file name encodes whichever parameter is being swept
    if n_neighbors != 15:
        plt.savefig('draw_umap_n_neighbors_' + str(n_neighbors)+ '.png', bbox_inches='tight')
    else:
        plt.savefig('draw_umap_min_dist_' + str(min_dist).replace('.', '0') + '.png', bbox_inches='tight')

In [18]:
# Sweep n_neighbors while the other UMAP settings stay at draw_umap's defaults.
for n_neighbors in (5, 10, 20, 35, 50):
    draw_umap(n_neighbors=n_neighbors, title='n_neighbors = {}'.format(n_neighbors))

In [19]:
# Sweep min_dist while the other UMAP settings stay at draw_umap's defaults.
for min_dist in (0.0, 0.1, 0.25, 0.5, 0.8, 0.99):
    draw_umap(min_dist=min_dist, title='min_dist = {}'.format(min_dist))

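Based on the plots we take **min_dist** = 0.1 and **n_neighbors** = 10. (Note: the 150-dimensional reducer for the labels below is actually configured with `n_neighbors = 15` and `min_dist = 0.0` — TODO: reconcile.)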

In [20]:
# One more plot with a much larger neighbourhood than the values swept above.
draw_umap(n_neighbors=100, title='n_neighbors = 100')

In [21]:
# Reduce the label embeddings to 150 dimensions; this is the input of the clustering below.
# random_state pins UMAP's stochastic optimisation so the embedding (and every silhouette /
# cluster derived from it) is the same on each run. Value 21 matches the k-means seed.
reducer = umap.UMAP(n_components = 150, n_neighbors = 15, min_dist = 0.0, metric = 'cosine', random_state = 21)
lab_emb_150d = reducer.fit_transform(labels_embeddings)

In [22]:
# Reduce the category embeddings to 150 dimensions (n_neighbors = 2, random initialisation).
# random_state makes the result reproducible across runs.
# NOTE(review): this is a separate UMAP fit from the one used for the labels, so the two
# 150-d spaces are not guaranteed to be comparable - confirm before comparing them directly.
reducer = umap.UMAP(n_components = 150, n_neighbors = 2, min_dist = 0.0, metric = 'cosine', init = 'random', random_state = 21)
cat_emb_150d = reducer.fit_transform(cat_embeddings)

In [23]:
import seaborn as sn
import pandas as pd
# Pairwise cosine similarity between the category embeddings as an annotated heatmap;
# rows and columns are labelled with the category positions.
cat_positions = list(range(len(cats)))
df_cm = pd.DataFrame(cosine_similarity(cat_embeddings), index = cat_positions, columns = cat_positions)
plt.figure(figsize = (10, 7))
sn.heatmap(df_cm, cmap="YlGnBu", linewidths=1, annot = True, fmt = '.2f')
plt.savefig('cat1.png', bbox_inches='tight')

In [24]:
# Same heatmap for the UMAP-reduced category embeddings.
cat_positions = list(range(len(cats)))
df_cm = pd.DataFrame(cosine_similarity(cat_emb_150d), index = cat_positions, columns = cat_positions)
plt.figure(figsize = (10, 7))
sn.heatmap(df_cm, cmap="YlGnBu", linewidths=1, annot = True, fmt = '.2f')
plt.savefig('cat2.png', bbox_inches='tight')

In [25]:
# NOTE(review): both arguments are row 0, so this is the similarity of a vector with itself;
# the neighbouring cell compares rows 0 and 4 - confirm which pair was intended here.
cosine_similarity([lab_emb_150d[0]], [lab_emb_150d[0]])

In [26]:
cosine_similarity([labels_embeddings[0]], [labels_embeddings[4]])

# Dict Embeddings:

In [29]:
def set_key(dictionary, key, value):
    """Store ``value`` under ``key``, collecting repeated insertions in a list.

    * First insertion: a value that is itself a ``list`` is stored wrapped in an outer
      list; any other value is stored bare.
    * Later insertions: if the stored entry is a ``list`` the value is appended to it,
      otherwise the entry becomes ``[stored, value]``.

    ``dictionary`` is modified in place; nothing is returned. Callers therefore have to
    cope with entries that are either a bare value or a list of values.
    """
    if key in dictionary:
        current = dictionary[key]
        if type(current) == list:
            current.append(value)
        else:
            dictionary[key] = [current, value]
        return

    # first time this key is seen
    dictionary[key] = [value] if type(value) == list else value

In [30]:
# Map every id in all_ids to the 150-d vector stored at the same position.
dict_embeddings = {all_ids[i]: lab_emb_150d[i] for i in range(len(all_ids))}

In [31]:
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open("../input/hyppr-images-mapping/objid_postid.p", "rb") as f:
    objid_postid = pickle.load(f)

# Inverse mapping: post id -> object id, or a list of object ids when a post id occurs
# more than once (see set_key).
postid_objid = {}
for obj_id, post_id in objid_postid.items():
    set_key(postid_objid, post_id, obj_id)

In [32]:
# Load the given (reference) model: category -> posts.
with open('../input/hyppr-images-mapping/category_to_posts_vision.p', 'rb') as f:
    category_to_posts = dict(pickle.load(f))

# Reverse view: every post keeps only the first category it is listed under.
posts_to_category = {}
for cat, posts in category_to_posts.items():
    for post in posts:
        if post in posts_to_category:
            continue
        set_key(posts_to_category, post, cat)

# Clusterization

In [33]:
import tensorflow as tf
from kmeanstf import KMeansTF
from sklearn.metrics import silhouette_score

# COMPARISON UMAP AND PCA GRAPHICS:

In [34]:
# First sweep (k = 2, 4, ..., 28): silhouette on the UMAP- and on the PCA-reduced label
# embeddings, plus the k-means inertia on the UMAP embedding.
silhouettes_umap_f = []
silhouettes_pca_f = []   # PCA counterpart, plotted next to the UMAP curve in the comparison cell

distortions_f = [] # making this for using elbow method in next cells

# The inputs do not depend on k, so they are converted to tensors once.
labels_embeddings_tf = tf.convert_to_tensor(lab_emb_150d)
pca_labels_tf = tf.convert_to_tensor(pca_labels)  # assumes pca_labels has a dtype KMeansTF accepts - TODO confirm

K = range(2,30,2)
for k in K:
    start_time = time.time()

    # UMAP embedding
    kmeanstf = KMeansTF(n_clusters = k, random_state = 21)
    kmeanstf.fit(labels_embeddings_tf)
    silhouettes_umap_f.append(silhouette_score(lab_emb_150d, kmeanstf.labels_))
    distortions_f.append(kmeanstf.inertia_)

    # PCA embedding, same k and seed
    kmeanstf_pca = KMeansTF(n_clusters = k, random_state = 21)
    kmeanstf_pca.fit(pca_labels_tf)
    silhouettes_pca_f.append(silhouette_score(pca_labels, kmeanstf_pca.labels_))

    end_time = time.time()
    print('KmeansTF execution time in seconds: {}'.format(end_time - start_time))

In [36]:
# Side-by-side silhouette curves of the first sweep: UMAP (left) and PCA (right), shared y axis.
# Expects silhouettes_pca_f (PCA counterpart of silhouettes_umap_f, same K) to exist.
fig, ax = plt.subplots(1, 2, figsize = (16, 8), sharey = True)
ax[0].plot(K, silhouettes_umap_f, 'yx-')
# the title states the UMAP settings that lab_emb_150d was actually built with
ax[0].set(xlabel = 'k', ylabel = 'Silhouette score', title = 'Silhouette for UMAP, k = 2 - 30, n_neighbors = 15, min_dist = 0.0')
# plt.yscale('log')
ax[1].plot(K, silhouettes_pca_f, 'cx-')
ax[1].set(xlabel = 'k', ylabel = 'Silhouette score', title = 'Silhouette for PCA, k = 2 - 30')
plt.savefig('silhouettes_comparison.pdf', bbox_inches='tight')

In [None]:
# Second sweep (k = 30, 33, ..., 147): silhouette on the UMAP- and on the PCA-reduced label
# embeddings, plus the k-means inertia on the UMAP embedding.
silhouettes_umap_s = []
silhouettes_pca_s = []   # PCA counterpart, plotted next to the UMAP curve in the comparison cell

distortions_s = []

# The inputs do not depend on k, so they are converted to tensors once.
labels_embeddings_tf = tf.convert_to_tensor(lab_emb_150d)
pca_labels_tf = tf.convert_to_tensor(pca_labels)  # assumes pca_labels has a dtype KMeansTF accepts - TODO confirm

K = range(30,150,3)
for k in K:
    start_time = time.time()

    # UMAP embedding
    kmeanstf = KMeansTF(n_clusters = k, random_state = 21)
    kmeanstf.fit(labels_embeddings_tf)
    silhouettes_umap_s.append(silhouette_score(lab_emb_150d, kmeanstf.labels_))
    distortions_s.append(kmeanstf.inertia_)

    # PCA embedding, same k and seed
    kmeanstf_pca = KMeansTF(n_clusters = k, random_state = 21)
    kmeanstf_pca.fit(pca_labels_tf)
    silhouettes_pca_s.append(silhouette_score(pca_labels, kmeanstf_pca.labels_))

    end_time = time.time()
    print('KmeansTF execution time in seconds: {}'.format(end_time - start_time))

In [None]:
# Side-by-side silhouette curves of the second sweep: UMAP (left) and PCA (right), shared y axis.
# Expects silhouettes_pca_s (PCA counterpart of silhouettes_umap_s, same K) to exist.
fig, ax = plt.subplots(1, 2, figsize = (16, 8), sharey = True)
ax[0].plot(K, silhouettes_umap_s, 'yx-')
# the title states the UMAP settings that lab_emb_150d was actually built with
ax[0].set(xlabel = 'k', ylabel = 'Silhouette score', title = 'Silhouette for UMAP, k = 30 - 150, n_neighbors = 15, min_dist = 0.0')
# plt.yscale('log')
ax[1].plot(K, silhouettes_pca_s, 'cx-')
ax[1].set(xlabel = 'k', ylabel = 'Silhouette score', title = 'Silhouette for PCA, k = 30 - 150.')
plt.savefig('silhouettes_comparison2.pdf', bbox_inches='tight')

## Now we need to find the optimal number of clusters.
### Since the Elbow Method does not always work well, we compare it with the Average Silhouette Method to find out which one is better

# Comparison of Elbow Method and Average Silhouette

In [None]:
#lab_emb_150d = labels_embeddings.copy() # using PCA finally # now we wont do that since PCA works bad
#cat_emb_150d = cat_embeddings.copy()

In [None]:
# Elbow (inertia, log scale) vs. average silhouette for the first sweep.
# The k range is spelled out because K has been reassigned by the later sweep.
fig, ax = plt.subplots(1, 2, figsize = (16, 8))
ax[0].plot(range(2,30,2), distortions_f, 'bx-')
# title aligned with the other elbow plots (it used to end in a dangling "k = ")
ax[0].set(xlabel = 'k', ylabel = 'Distortion', title = 'The Elbow Method')
ax[0].set_yscale("log")
ax[1].plot(range(2,30,2), silhouettes_umap_f, 'rx-')
ax[1].set(xlabel = 'k', ylabel = 'Silhouette score', title = 'The Average Silhouette Method')
plt.savefig('elbowSilhouette1.png', bbox_inches='tight')

In [None]:
# Position (within the first sweep) and value of the best silhouette score.
max_silh_id = np.argmax(silhouettes_umap_f)
max_silh = silhouettes_umap_f[max_silh_id]

In [None]:
# Elbow (inertia, log scale) vs. average silhouette for the second sweep; K is the range
# of the most recently run sweep.
fig, ax = plt.subplots(1, 2, figsize = (16, 8))
elbow_ax, silhouette_ax = ax

elbow_ax.plot(K, distortions_s, 'bx-')
elbow_ax.set(xlabel = 'k', ylabel = 'Distortion', title = 'The Elbow Method')
elbow_ax.set_yscale('log')

silhouette_ax.plot(K, silhouettes_umap_s, 'rx-')
silhouette_ax.set(xlabel = 'k', ylabel = 'Silhouette score', title = 'The Average Silhouette Method')

plt.savefig('elbowSilhouette2.png', bbox_inches='tight')

In [None]:
# Third sweep (k = 150, 153, ..., 207) on the 150-d label embeddings, followed by the
# elbow / silhouette plot for this range.
silhouettes_umap_t = []

distortions_t = []

# The input does not depend on k, so it is converted to a tensor once.
labels_embeddings_tf = tf.convert_to_tensor(lab_emb_150d)

K = range(150,210,3)
for k in K:
    start_time = time.time()
    kmeanstf = KMeansTF(n_clusters = k, random_state = 21)

    kmeanstf.fit(labels_embeddings_tf)
    silhouettes_umap_t.append(silhouette_score(lab_emb_150d, kmeanstf.labels_))

    distortions_t.append(kmeanstf.inertia_)

    end_time = time.time()
    print('KmeansTF execution time in seconds: {}'.format(end_time - start_time))

fig, ax = plt.subplots(1, 2, figsize = (16, 8))
ax[0].plot(K, distortions_t, 'bx-')
ax[0].set(xlabel = 'k', ylabel = 'Distortion', title = 'The Elbow Method')
ax[0].set_yscale('log')
ax[1].plot(K, silhouettes_umap_t, 'rx-')
ax[1].set(xlabel = 'k', ylabel = 'Silhouette score', title = 'The Average Silhouette Method')
plt.savefig('elbowSilhouette3.png', bbox_inches='tight')

In [None]:
# Elbow vs. average silhouette for the third sweep.
# NOTE(review): this redraws the plot produced at the end of the previous cell and writes
# the same file name.
fig, ax = plt.subplots(1, 2, figsize = (16, 8))
elbow_ax, silhouette_ax = ax

elbow_ax.plot(K, distortions_t, 'bx-')
elbow_ax.set(xlabel = 'k', ylabel = 'Distortion', title = 'The Elbow Method')
elbow_ax.set_yscale('log')

silhouette_ax.plot(K, silhouettes_umap_t, 'rx-')
silhouette_ax.set(xlabel = 'k', ylabel = 'Silhouette score', title = 'The Average Silhouette Method')

plt.savefig('elbowSilhouette3.png', bbox_inches='tight')

### Find out where Silhouette is bigger:

In [None]:
# Pick the k with the highest silhouette over the first two sweeps.
silhouettes = silhouettes_umap_f + silhouettes_umap_s
# k values in the same order as `silhouettes`: first sweep, then second sweep
k_values = list(range(2,30,2)) + list(range(30,150,3))
assert len(k_values) == len(silhouettes), 'sweep results do not match the expected k ranges'
# on UMAP one could see a monotone graph, so around n_clusters = 15 is about right
max_silh_id = np.argmax(silhouettes)   # first maximum wins on ties
max_silh = silhouettes[max_silh_id]
# look the k up directly; the old index arithmetic was only valid for the second sweep
n_themes = k_values[max_silh_id]
n_themes

In [None]:
import hdbscan
from sklearn.metrics import pairwise_distances

# Density-based alternative: HDBSCAN on a precomputed cosine-distance matrix
# (cast to float64 before fitting).
distance = pairwise_distances(lab_emb_150d, metric='cosine')
clusterer = hdbscan.HDBSCAN(min_cluster_size=15, metric='precomputed').fit(distance.astype('float64'))
clusterer.labels_

In [None]:
# NOTE(review): despite the "_150d" names, from here on these variables hold copies of
# labels_embeddings / cat_embeddings; the UMAP projections computed earlier are replaced.
lab_emb_150d = labels_embeddings.copy()
cat_emb_150d = cat_embeddings.copy()

In [None]:
# Final clustering: k-means with a hand-picked number of themes (this overrides any
# n_themes computed from the silhouette sweeps).
n_themes = 200
labels_embeddings_tf = tf.convert_to_tensor(labels_embeddings)
kmeanstf = KMeansTF(n_clusters = n_themes, random_state = 21)
kmeanstf.fit(labels_embeddings_tf)

In [None]:
print("Silhouette metric score: {}".format(silhouette_score(lab_emb_150d, kmeanstf.labels_)))

cluster_centers = kmeanstf.cluster_centers_.numpy()
cluster_labels = kmeanstf.labels_.numpy()   # cluster id of every row

# Three lookups filled in one pass over the rows:
#   cluster_to_emb : cluster id -> embedding(s) of its rows
#   cluster_to_im  : cluster id -> label lists of its rows
#   objid_cluster  : id from all_ids -> cluster id
cluster_to_emb, cluster_to_im, objid_cluster = dict(), dict(), dict()

for ID, cluster_id in enumerate(cluster_labels):
    set_key(cluster_to_emb, cluster_id, labels_embeddings[ID].copy())
    set_key(cluster_to_im, cluster_id, labels_all[ID].copy())
    set_key(objid_cluster, all_ids[ID], cluster_id.copy())
    # sanity check: every entry of labels_all is expected to be a list
    if type(labels_all[ID]) != list:
        print('f')
        print(labels_all[ID])

In [None]:
# cluster_centers = dict()
# for key, val in cluster_to_emb.items():
#     center = np.zeros(labels_embeddings.shape[1])
#     for emb in val:
#         center = np.add(center, emb)
#     center = np.divide(center, len(val))
#     set_key(cluster_centers, key, center)

In [None]:
# for idcl_lab in range(len(cluster_labels)):
#     if cluster_labels[idcl_lab] == 0:
#         print (idcl_lab)

## ONLY FOR OLD MODEL DICTIONARY:


In [None]:
# Load the given (reference) model again: category -> posts.
with open('../input/hyppr-images-mapping/category_to_posts_vision.p', 'rb') as f:
    category_to_posts = dict(pickle.load(f))

# Reverse view: every post keeps only the first category it is listed under.
posts_to_category = {}
for cat, posts in category_to_posts.items():
    for post in posts:
        if post in posts_to_category:
            continue
        set_key(posts_to_category, post, cat)

### Now we need to find the closest cluster vector for each of the given categories. Then we can map clusters to categories.
### We'll use the ```cosine_similarity``` function from ```sklearn.metrics.pairwise``` as the similarity metric

In [None]:
# Mean embedding of the members of every cluster.
cl_cent = []
for i in range(n_themes):
    # set_key stores a single member as a bare vector and several members as a list of
    # vectors; atleast_2d turns both into a (members, dim) array
    members = np.atleast_2d(np.asarray(cluster_to_emb[i]))
    # average the members themselves (the old loop summed labels_embeddings[i] instead)
    cl_cent.append(members.mean(axis = 0, dtype = np.float64))

In [None]:
# Use the centres returned by k-means; this replaces any previously computed cl_cent.
cl_cent = cluster_centers.copy()

In [None]:
# Cosine similarity of every category embedding (rows) to every cluster centre (columns).
most_similars = cosine_similarity(cat_embeddings, cl_cent)
most_similars.shape

# Let's look at the first category and its nearest cluster:

In [None]:
#print(np.round(most_similars, 8)[0])
nearest_id = np.argmax(most_similars[0])

nearest_id

In [None]:
cluster_to_im[182]

In [None]:
cats

In [None]:
cluster_to_im[10]

In [None]:
# Best cluster per category when clusters may be shared between categories.
most_similars_indices = np.argmax(most_similars, axis = 1)
# Pool of cluster indices not yet assigned (rebuilt at the start of the next cell).
curr_set_indices = np.arange(most_similars.shape[1]) # constructing a set for eliminating viewed parts

In [None]:
# Greedy one-to-one assignment: walk the categories in order and give each one the most
# similar cluster that has not been taken by an earlier category.
curr_set_indices = np.arange(most_similars.shape[1])
sim_indices = []
# at most one cluster per category and one category per cluster
n_assignments = min(most_similars.shape[0], most_similars.shape[1])
for i in range(n_assignments):
    # make a threshold, make something else
    remaining_scores = most_similars[i][curr_set_indices]
    sim_ind_index_old_arr = np.argmax(remaining_scores)   # position inside the remaining pool
    sim_indices.append(curr_set_indices[sim_ind_index_old_arr])
    curr_set_indices = np.delete(curr_set_indices, sim_ind_index_old_arr)

sim_indices            # cluster index chosen for each category, in category order

# QUALITY COMPARISON

In [None]:
from sklearn.metrics.pairwise import cosine_similarity as cos
cos([lab_emb_150d[0]], [lab_emb_150d[4]])

In [None]:
# Cosine similarity of every row of cat_emb_150d to every cluster centre.
# NOTE(review): this is only meaningful when cat_emb_150d and cluster_centers live in
# the same vector space - confirm.
cos_matrix = cos(cat_emb_150d, cluster_centers)
cos_matrix

In [None]:
def indices_of_similarity(sim_matrix, metric):
    """Greedily give every row of ``sim_matrix`` its own best-matching column.

    Rows are processed in order. Each row takes the best column that no earlier row has
    taken, so every column is used at most once.

    Parameters
    ----------
    sim_matrix : 2-D array-like
        ``sim_matrix[i][j]`` scores row ``i`` against column ``j``.
    metric : {'euclidean', 'cosine'}
        'euclidean' treats smaller values as better (argmin), 'cosine' treats larger
        values as better (argmax).

    Returns
    -------
    list
        Column indices, one per processed row; its length is ``min(sim_matrix.shape)``.

    Raises
    ------
    ValueError
        If ``metric`` is neither 'euclidean' nor 'cosine'.
    """
    if metric == 'euclidean':
        pick_best = np.argmin
    elif metric == 'cosine':
        pick_best = np.argmax
    else:
        raise ValueError("metric must be 'euclidean' or 'cosine', got {!r}".format(metric))

    sim_matrix = np.asarray(sim_matrix)
    # the candidates are COLUMN indices, so the pool is sized by the number of columns
    # (max(shape) over-ran the columns whenever there were more rows than columns)
    curr_set_indices = np.arange(sim_matrix.shape[1])

    sim_indices = []
    for i in range(min(sim_matrix.shape)):   # one column per row until either runs out
        best_pos = pick_best(sim_matrix[i][curr_set_indices])  # position inside the pool
        sim_indices.append(curr_set_indices[best_pos])
        curr_set_indices = np.delete(curr_set_indices, best_pos)

    return sim_indices

In [None]:
# Cluster chosen for each category (greedy, cosine similarity); kept under both names.
most_similar = cos_sim = indices_of_similarity(most_similars, 'cosine')

In [None]:
most_similar

In [None]:
cluster_to_im[154]

In [None]:
# Total number of entries stored under the clusters that were matched to a category.
sum1 = sum(len(cluster_to_im[ind]) for ind in most_similar)
sum1

In [None]:
# Resulting dictionary: object id -> category name, for objects whose cluster was matched
# to a category. Objects in unmatched clusters are left out.
objid_category = dict()
for objid, clusterid in objid_cluster.items():
    if clusterid not in most_similar:
        continue
    # position in most_similar == index of the category that cluster was assigned to
    sim_category_id = most_similar.index(clusterid)
    # every category is a list of terms; the first term is the category's name
    set_key(objid_category, objid, categories[sim_category_id][0])

# Inverse: category name -> object id(s)
category_objid = dict()
for objid, cat in objid_category.items():
    set_key(category_objid, cat, objid)

In [None]:
# Reference model: category -> posts.
with open('../input/hyppr-images-mapping/category_to_posts_vision.p', 'rb') as f:
    category_to_posts = dict(pickle.load(f))

# Object id -> post id, needed to build category_postid.
with open('../input/hyppr-images-mapping/objid_postid.p', 'rb') as f:
    objid_postid = pickle.load(f)

In [None]:
len(objid_postid)

In [None]:
# Translate category -> object ids into category -> post ids.
category_postid = dict()
no_obj = 0   # object ids without a known post
for cat, objid in category_objid.items():
    # set_key stores a single object id bare and several as a list; normalise so a lone
    # id is not iterated element by element
    objids = objid if type(objid) == list else [objid]
    for obj in objids:
        try:
            set_key(category_postid, cat, objid_postid[obj])
        except KeyError:
            no_obj += 1
print(no_obj)

In [None]:
# Number of posts assigned by the new model, summed over categories.
num_els = sum(len(idlist) for idlist in category_postid.values())
num_els

In [None]:
# Number of posts in the reference model, summed over categories.
num_els = sum(len(idlist) for idlist in category_to_posts.values())
num_els

In [None]:
# Count the reference posts that the new model placed in the same category.
right = 0
for cat, idlist in category_to_posts.items():
    # a category may be absent from the new model, and set_key stores a lone post bare
    predicted = category_postid.get(cat, [])
    if type(predicted) != list:
        predicted = [predicted]
    predicted = set(predicted)   # constant-time membership tests
    right += sum(1 for idpost in idlist if idpost in predicted)
right

In [None]:
# Persist the object id -> category mapping next to the notebook.
with open("./objid_category.p", "wb") as f:
    pickle.dump(objid_category, f)

## Doing reverse dictionaries:

In [None]:
# Reference model, reversed: post -> category (or list of categories).
posts_to_category = dict()
for cat, posts in category_to_posts.items():
    for post in posts:
        set_key(posts_to_category, post, cat)

# New model, reversed: post -> category (or list of categories).
# NOTE(review): a category_postid value is a bare post when the category has a single
# post (see set_key) - confirm that case cannot occur here.
postid_category = dict()
for cat, posts in category_postid.items():
    for post in posts:
        set_key(postid_category, post, cat)

In [None]:
# Inputs for the evaluation below.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open("../input/hyppr-images-mapping/objid_mark_category.p", "rb") as f:
    objid_mark_category = pickle.load(f)

with open("../input/hyppr-images-mapping/yRight.p", "rb") as f:
    yRight = pickle.load(f)

with open("../input/hyppr-images-mapping/objid_postid.p", "rb") as f:
    objid_postid = pickle.load(f)

# Inverse mapping: post id -> object id(s)
postid_objid = {}
for obj_id, post_id in objid_postid.items():
    set_key(postid_objid, post_id, obj_id)

with open("../input/hyppr-images-mapping/postid_objurls.p", "rb") as f:
    postid_objurls = pickle.load(f)

In [None]:
# Build two aligned label lists over the reference posts:
#   yNew - category predicted by the new model ('none' when the object has no category)
#   yDef - reference category (objid_mark_category when available, else the old model's)
# Posts without a known object id are skipped in both lists.
yDef,yNew = [], []
no_post, no_topic, no_labels = 0, 0, 0
arguable_data = dict()
sec_dict_values = dict()
for post, cat in posts_to_category.items():
    if post not in postid_objid:
        no_post += 1
        continue

    objid = postid_objid[post]
    # set_key yields a list when a post has several objects; use the first one, as is
    # done for list-valued categories and urls below (a list cannot be used as a dict key)
    if type(objid) is list:
        objid = objid[0]

    if objid in objid_category:
        yNew.append(objid_category[objid])
    else:
        no_topic += 1
        yNew.append('none')

    if objid in objid_mark_category:
        yDef.append(objid_mark_category[objid])
    else:
        yDef.append(cat[0] if type(cat) is list else cat)
        no_labels += 1

    if yNew[-1] != yDef[-1]:   # disagreement: candidate for arguable_data
        url = postid_objurls[post]
        if type(url) == list:
            url = url[0]

#             set_key(arguable_data, cat, url)
#             set_key(sec_dict_values, cat, yNew[-1])
print('There is {} post missing'.format(no_post))
print('There is {} obj to topic missing'.format(no_topic))

In [None]:
len(yDef)

In [None]:
#visible_obj = [cat for cat, posts in category_to_posts.items()]   # len of category_to_posts is 10 so we need to truncate
                                                                    # len of BERT model result
# NOTE(review): this cell rebinds yDef, which was also built together with yNew in an
# earlier cell; cells that pair yDef with yNew afterwards see this version - confirm.
yDef = list()
yBert = list()

for post, cat in posts_to_category.items():
    # category from the new model, 'none' when the post is unknown to it
    el = postid_category.get(post, 'none')
    yBert.append(el[0] if type(el) is list else el)

    # reference category; the first one when several are recorded
    yDef.append(cat[0] if type(cat) is list else cat)
# for post, cat in postid_category.items():
#     if post in posts_to_category.keys():
#         pass
#     else:
#         yBert.append(cat)
#         yDef.append('none')

In [None]:
# There is no data on the site for these categories, so anything related to them is left out.
nodata = ['dance', 'entertainment', 'tech']
# Remaining category names, plus the 'none' bucket for unassigned items.
cats = [cat[0] for cat in categories if cat[0] not in nodata] + ['none']

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of reference (yDef) vs. new-model (yNew) categories, restricted to `cats`.
# NOTE(review): yDef and yNew must be aligned and of equal length; yDef is rebuilt by the
# yBert cell above - confirm the intended pairing.
conf_matr = confusion_matrix(yDef, yNew, labels = cats)

In [None]:
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

# Annotated heatmap of the confusion matrix; rows and columns are labelled with the
# positions of the categories in `cats`.
cat_positions = list(range(len(cats)))
df_cm = pd.DataFrame(conf_matr, index = cat_positions, columns = cat_positions)
plt.figure(figsize = (10,7))
sn.heatmap(df_cm, cmap="YlGnBu", linewidths=1, annot=True, fmt = 'd')
plt.savefig('bertresult.png', bbox_inches = 'tight')

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Micro-averaged F1 of the new model against the reference, restricted to `cats`.
f1_score(yDef,yNew, labels = cats, average = 'micro')