In [None]:
# Install gensim into the notebook environment (--quiet suppresses pip's output).
!pip install gensim --quiet

In [None]:
import gensim, gensim.downloader
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import requests
import os
import time
import matplotlib.pyplot as plt

In [None]:
# Read the input files: three comma-separated text files (one record per line,
# each record becoming a list of strings) and the pickled list of ids.
with open('../input/hyppr-images-mapping/categories.txt', 'r') as file:
    # lower-cased and stripped of trailing whitespace before splitting
    categories = [row.lower().rstrip().split(',') for row in file]

with open('../input/hyppr-images-mapping/labels.txt', 'r') as file:
    # trailing commas / newlines are removed so no empty last field is produced
    labels_all = [row.rstrip(',\n').split(',') for row in file]

with open('../input/hyppr-images-mapping/scores.txt', 'r') as file:
    # kept as strings; no numeric conversion happens here
    scores = [row.rstrip(',\n').split(',') for row in file]

# NOTE: pickle.load executes code embedded in the file; only use trusted inputs.
with open("../input/hyppr-images-mapping/all_ids.p", "rb") as f:
    all_ids = pickle.load(f)

In [None]:
def set_key(dictionary, key, value):
    """Store ``value`` under ``key``, collecting repeated keys into a list.

    * key absent               -> ``dictionary[key] = value``
    * stored value is a list   -> ``value`` is appended to that list
    * stored value is not list -> replaced by ``[stored, value]``

    The dictionary is modified in place and nothing is returned.
    """
    if key not in dictionary:
        dictionary[key] = value
        return

    current = dictionary[key]
    if type(current) == list:
        current.append(value)
    else:
        dictionary[key] = [current, value]

In [None]:
# One string per image: its labels joined by spaces, with hyphens inside a
# label treated as word separators as well.
labels_n = [
    " ".join(re.split(' |-', " ".join(image_labels)))
    for image_labels in labels_all
]

# Space-joined category rows, and the first entry of every category row.
categories_n = [" ".join(row) for row in categories]
cats = [row[0] for row in categories]

In [None]:
# get the tfidf vectors #
tfidf_vec = TfidfVectorizer(ngram_range = (1, 3))
%time tfidf_vec.fit_transform(labels_n)

word_weight_dict = dict(zip(tfidf_vec.get_feature_names(), tfidf_vec.idf_))

len(tfidf_vec.get_feature_names())

In [None]:
# Load the LexVec word vectors from their text (non-binary) word2vec-format file.
lexvec = gensim.models.KeyedVectors.load_word2vec_format(
    '../input/lexvec/lexvec.commoncrawl.300d.W.pos.neg3.vectors',
    binary=False,
)

# LEXVEC model testing

In [None]:
# Build one embedding per image as an IDF-weighted combination of the LexVec
# vectors of its labels.
n = lexvec.vector_size   # dimensionality of the word vectors (same value as lexvec[0].shape[0])

# Build the stop-word set once. The original called stopwords.words('english')
# for every sub-word, repeating the corpus lookup and a linear list search.
# NOTE(review): requires the nltk 'stopwords' corpus to be available.
english_stopwords = set(stopwords.words('english'))

images_embeddings_lv = []   # one aggregated vector per image, in labels_all order
count_not_working = 0       # labels / sub-words with no TF-IDF weight or no LexVec vector

# Images for which no label could be embedded (their vector stays all-zero).
# NOTE(review): the ids are collected here but not removed in this cell.
ids_of_zero_vectors = []

for idimage in range(len(labels_all)):
    agg1 = np.zeros(n)      # running sum of the per-label vectors of this image
    weights_sum = 0         # total IDF weight that contributed to agg1
    for label in labels_all[idimage]:
        agg = np.zeros(n)
        # split multi-word labels on spaces and (double) hyphens
        subwords = re.split(' |--|-', label.replace('&', 'and'))
        if len(subwords) == 1:
            # single-word label: weighted vector of the label itself
            try:
                agg = word_weight_dict[label] * lexvec[label]
                weights_sum += word_weight_dict[label]
            except KeyError:
                count_not_working += 1
        else:
            # multi-word label: weighted mean of its non-stop-word sub-words
            # NOTE(review): this branch contributes a *normalised* mean while
            # still adding its full weight to weights_sum, unlike the
            # single-word branch — confirm this asymmetry is intended.
            common_weight = 0
            for subw in subwords:
                if subw == '' or subw in english_stopwords:
                    continue   # empty fragments and stop words contribute nothing
                try:
                    agg = np.add(agg, word_weight_dict[subw] * lexvec[subw])
                    common_weight += word_weight_dict[subw]
                except KeyError:
                    count_not_working += 1
            if common_weight > 1e-6:
                weights_sum += common_weight
                agg = np.divide(agg, common_weight)

        agg1 = np.add(agg1, agg)

    if weights_sum > 1e-6:
        m = weights_sum
    else:
        # nothing could be embedded: agg1 is all zeros, divide by the label count
        m = len(labels_all[idimage])
        ids_of_zero_vectors.append(idimage)

    images_embeddings_lv.append(np.divide(agg1, m))

images_embeddings_lv = np.array(images_embeddings_lv)
print(count_not_working)

## Aggregating category embeddings

In [None]:
# Pickled lookup tables used to build the category embeddings.
# NOTE: pickle.load executes code embedded in the file; only use trusted inputs.
with open("../input/hyppr-images-mapping/category_to_urls.p", mode="rb") as urls_file:
    categories_urls = pickle.load(urls_file)

with open("../input/hyppr-images-mapping/urls_imagelabels.p", mode="rb") as labels_file:
    urls_imagelabels = pickle.load(labels_file)

In [None]:
# Dimensionality of the word vectors. `vector_size` is read directly instead of
# indexing the model with an integer, which not every gensim release supports.
n = lexvec.vector_size

The categories 'dance', 'entertainment' and 'tech' are excluded because there is no data for them on the site.

In [None]:
# Categories with no data on the site; anything related to them cannot be used,
# so they are removed from the category list.
nodata = ['dance', 'entertainment', 'tech']
cats = list(filter(lambda name: name not in nodata, cats))

In [None]:
# Build one embedding per category: the mean of the embeddings of its images,
# where an image embedding is the IDF-weighted combination of its label vectors.
cat_embeddings = dict()   # category -> aggregated embedding
numb_of_fails = 0         # labels / sub-words with no TF-IDF weight or no LexVec vector
no_url = 0                # category images that have no entry in urls_imagelabels

for category in cats:
    emb_cat = np.zeros(n)
    numb_of_images = 0    # images of this category that produced an embedding
    for image in categories_urls[category]:
        # Guard only the URL lookup. Previously the whole image body sat in the
        # try block, so an unknown word (KeyError) in the single-word branch was
        # counted as a missing URL and silently dropped the image.
        try:
            distr = urls_imagelabels[image]
        except KeyError:
            no_url += 1
            continue

        # Accumulate over ALL labels of the image. Previously the accumulator
        # was re-zeroed for every label, so only the last label survived.
        emb_image = np.zeros(n)
        weights_sum = 0
        for label in distr:
            label = label[0].lower()
            emb_temp = np.zeros(n)   # contribution of this one label
            subwords = re.split(' |--|-|, ', label)

            if len(subwords) == 1:
                # single-word label: weighted vector of the label itself
                try:
                    emb_temp = word_weight_dict[label] * lexvec[label]
                    weights_sum += word_weight_dict[label]
                except (KeyError, TypeError):
                    numb_of_fails += 1
            else:
                # multi-word label: weighted mean of the sub-words that resolve
                common_weight = 0
                for subw in subwords:
                    try:
                        emb_temp = np.add(emb_temp, word_weight_dict[subw] * lexvec[subw])
                        common_weight += word_weight_dict[subw]
                    except (KeyError, TypeError):
                        numb_of_fails += 1
                if common_weight > 1e-6:
                    weights_sum += common_weight
                    emb_temp = np.divide(emb_temp, common_weight)

            emb_image = np.add(emb_image, emb_temp)

        # Skip images with no usable label instead of dividing by zero, which
        # would put NaNs into the category embedding.
        if weights_sum > 1e-6:
            emb_cat = np.add(emb_cat, np.divide(emb_image, weights_sum))
            numb_of_images += 1

    # A category without any usable image keeps an all-zero embedding.
    if numb_of_images > 0:
        emb_cat = np.divide(emb_cat, numb_of_images)
    cat_embeddings[category] = emb_cat

print('There is no such url: {}'.format(no_url))
print('Number of fails: {}'.format(numb_of_fails))

In [None]:
# Persist the category embeddings next to the notebook.
with open("./cat_embeddings.p", mode="wb") as out_file:
    pickle.dump(cat_embeddings, out_file)

# Processing the remaining data that occurs rarely or not at all on the site

# Quality estimation

In [None]:
# id -> image embedding; all_ids and images_embeddings_lv are paired by position.
dict_embeddings = {
    obj_id: images_embeddings_lv[position]
    for position, obj_id in enumerate(all_ids)
}

In [None]:
# Persist the id -> embedding mapping next to the notebook.
with open("./dict_embeddings.p", mode="wb") as out_file:
    pickle.dump(dict_embeddings, out_file)

In [None]:
def set_key(dictionary, key, value):
    """Insert ``value`` at ``key``; a key seen more than once ends up holding a list.

    This cell re-declares the helper with the same behaviour as the ``set_key``
    defined earlier in the notebook. The dictionary is modified in place.
    """
    if key in dictionary:
        previous = dictionary[key]
        if type(previous) == list:
            # already a collection: extend it
            previous.append(value)
        else:
            # second value for this key: start a list with both
            dictionary[key] = [previous, value]
    else:
        dictionary[key] = value

In [None]:
# Object id -> post id mapping as stored on disk (pickle is imported in the
# notebook's import cell).
with open("../input/hyppr-images-mapping/objid_postid.p", mode="rb") as mapping_file:
    objid_postid = pickle.load(mapping_file)

# Invert it to post id -> object id. set_key turns the value into a list when
# several objects share one post.
postid_objid = {}
for objid, postid in objid_postid.items():
    set_key(postid_objid, postid, objid)

In [None]:
# Load the given category -> posts assignment and make sure it is a plain dict.
with open('../input/hyppr-images-mapping/category_to_posts_vision.p', 'rb') as f:
    category_to_posts = dict(pickle.load(f))

# Invert to post -> category. A post listed under several categories keeps the
# first category encountered; later ones are ignored.
posts_to_category = {}
for cat, posts in category_to_posts.items():
    for post in posts:
        posts_to_category.setdefault(post, cat)

# Category embeddings as a list, in the insertion order of cat_embeddings.
cat_emb = list(cat_embeddings.values())

In [None]:
# Rebuild the category list without the categories that have no data on the
# site, then add the fallback label 'none' as the last entry.
nodata = ['dance', 'entertainment', 'tech']
cats = [row[0] for row in categories if row[0] not in nodata] + ['none']

In [None]:
# Display the number of category labels currently in `cats`.
len(cats)

In [None]:
# (Re)load the image-url -> labels table and the post-id -> object-urls table.
with open("../input/hyppr-images-mapping/urls_imagelabels.p", mode="rb") as labels_file:
    urls_imagelabels = pickle.load(labels_file)
with open("../input/hyppr-images-mapping/postid_objurls.p", mode="rb") as urls_file:
    postid_objurls = pickle.load(urls_file)

# Parallel lists: every image url, and the lower-cased first field of each of
# its label entries (entries whose first field equals [] are skipped).
all_image_urls = list(urls_imagelabels.keys())
all_image_labels = [
    [entry[0].lower() for entry in entries if entry[0] != []]
    for entries in urls_imagelabels.values()
]

In [None]:
# Compare the given category of every post (yDef) with the category predicted
# from embedding similarity (yNew), and collect the posts where they disagree.
threshold = 0.5   # below this best similarity a post is predicted as 'none'
yDef, yNew = [], []
no_post, no_emb, no_labels = 0, 0, 0   # no_labels is reported below but never incremented here
arguable_data = dict()      # given category -> url(s) of disagreeing posts
sec_dict_values = dict()    # given category -> predicted label(s) of disagreeing posts
cos_similarities = dict()   # given category -> similarity column(s) of disagreeing posts
for post, cat in posts_to_category.items():
    if post not in postid_objid:
        no_post += 1
        continue

    objid = postid_objid[post]
    # set_key stores a list when a post has several objects; use the first one,
    # as is done for list-valued urls below (a list cannot be a dict key).
    if type(objid) == list:
        objid = objid[0]
    if objid not in dict_embeddings:
        no_emb += 1
        continue

    # Similarity of this object to every category, computed once per post.
    similarities = cosine_similarity(cat_emb, [dict_embeddings[objid]])
    if similarities.max() < threshold:
        yNew.append('none')
    else:
        yNew.append(cats[similarities.argmax()])

    yDef.append(cat[0] if type(cat) is list else cat)

    if yNew[-1] != yDef[-1]:
        # keep the details of disagreeing posts for later inspection
        url = postid_objurls[post]
        if type(url) == list:
            url = url[0]
        set_key(cos_similarities, cat, similarities)
        set_key(arguable_data, cat, url)
        set_key(sec_dict_values, cat, yNew[-1])

print('There is {} post missing'.format(no_post))
print('There is {} embeddings missing'.format(no_emb))
print('There is {} post to labels missing'.format(no_labels))

In [None]:
# Load the pickled marked categories and the reference labels (yRight).
# NOTE: pickle.load executes code embedded in the file; only use trusted inputs.
with open("../input/hyppr-images-mapping/objid_mark_category.p", mode="rb") as marks_file:
    objid_mark_category = pickle.load(marks_file)

with open("../input/hyppr-images-mapping/yRight.p", mode="rb") as reference_file:
    yRight = pickle.load(reference_file)

In [None]:
# Sanity check: the given and the predicted label lists must have the same length.
# NOTE(review): the metrics below compare yRight with yNew; this check does not
# cover len(yRight) — confirm the two are aligned.
len(yDef) == len(yNew)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, precision_score, recall_score

In [None]:
# Confusion matrix of the reference labels (rows) against the predictions (columns).
conf_matr = confusion_matrix(yRight, yNew, labels = cats)

# Label both axes with the category names; positional indices made the
# heatmap impossible to read without looking up `cats`.
df_cm = pd.DataFrame(conf_matr, index = cats, columns = cats)
plt.figure(figsize = (10,7))
heatmap_ax = sn.heatmap(df_cm,cmap="YlGnBu",linewidths=1, annot=True, fmt = 'd')
heatmap_ax.set_xlabel('predicted category')
heatmap_ax.set_ylabel('reference category')
plt.savefig('word2vecQuality.png', bbox_inches = 'tight')

In [None]:
# Macro precision / recall / F1 of the predictions for a range of thresholds.
thresholds = np.arange(0,1,0.1)
f_1_scores_new, prec_scores, rec_scores = [], [], []

# The similarities do not depend on the threshold, so each post is scored once
# instead of once per threshold.
scored_posts = []   # (given category, index of best category, best similarity)
for post, cat in posts_to_category.items():
    if post not in postid_objid:
        continue
    objid = postid_objid[post]
    # set_key stores a list when a post has several objects; use the first one
    # (a list cannot be used as a dict key).
    if type(objid) == list:
        objid = objid[0]
    if objid not in dict_embeddings:
        continue
    similarities = cosine_similarity(cat_emb, [dict_embeddings[objid]])
    scored_posts.append((cat[0] if type(cat) is list else cat,
                         similarities.argmax(),
                         similarities.max()))

yDef = [given_cat for given_cat, best_id, best_sim in scored_posts]
yNew = []
for thresh in thresholds:
    threshold = thresh
    # a post whose best similarity is below the threshold is predicted as 'none'
    yNew = ['none' if best_sim < threshold else cats[best_id]
            for given_cat, best_id, best_sim in scored_posts]
    prec_scores.append(precision_score(yRight,yNew, labels = cats, average = 'macro'))
    rec_scores.append(recall_score(yRight,yNew, labels = cats, average = 'macro'))
    f_1_scores_new.append(f1_score(yRight,yNew, labels = cats, average = 'macro'))

In [None]:
# Three side-by-side panels sharing the y axis: precision, recall and F1 as a
# function of the threshold t.
fig, ax = plt.subplots(1, 3, sharey = True, figsize = (16, 7))
panels = (
    (prec_scores, 'yx-', 'precision'),
    (rec_scores, 'rx-', 'recall'),
    (f_1_scores_new, 'gx-', '$F_1$-score'),
)
for axis, (values, line_format, y_label) in zip(ax, panels):
    axis.plot(thresholds, values, line_format)
    axis.set_xlabel('t')
    axis.set_ylabel(y_label)
plt.savefig('w2vQual.png', bbox_inches='tight')

In [None]:
# Re-run the evaluation at a fixed threshold (the sweep left `threshold`, `yDef`
# and `yNew` at their last-iteration values) and redraw the confusion matrix.
threshold = 0.5   # below this best similarity a post is predicted as 'none'
yDef, yNew = [], []
no_post, no_emb, no_labels = 0, 0, 0
arguable_data = dict()      # given category -> url(s) of disagreeing posts
sec_dict_values = dict()    # given category -> predicted label(s) of disagreeing posts
cos_similarities = dict()   # given category -> similarity column(s) of disagreeing posts
for post, cat in posts_to_category.items():
    if post not in postid_objid:
        no_post += 1
        continue

    objid = postid_objid[post]
    # set_key stores a list when a post has several objects; use the first one,
    # as is done for list-valued urls below (a list cannot be a dict key).
    if type(objid) == list:
        objid = objid[0]
    if objid not in dict_embeddings:
        no_emb += 1
        continue

    # Similarity of this object to every category, computed once per post.
    similarities = cosine_similarity(cat_emb, [dict_embeddings[objid]])
    if similarities.max() < threshold:
        yNew.append('none')
    else:
        yNew.append(cats[similarities.argmax()])

    yDef.append(cat[0] if type(cat) is list else cat)

    if yNew[-1] != yDef[-1]:
        # keep the details of disagreeing posts for later inspection
        url = postid_objurls[post]
        if type(url) == list:
            url = url[0]
        set_key(cos_similarities, cat, similarities)
        set_key(arguable_data, cat, url)
        set_key(sec_dict_values, cat, yNew[-1])

# Confusion matrix of the reference labels (rows) against the predictions (columns).
conf_matr = confusion_matrix(yRight, yNew, labels = cats)

# Label both axes with the category names instead of positional indices.
df_cm = pd.DataFrame(conf_matr, index = cats, columns = cats)
plt.figure(figsize = (10,7))
heatmap_ax = sn.heatmap(df_cm,cmap="YlGnBu",linewidths=1, annot=True, fmt = 'd')
heatmap_ax.set_xlabel('predicted category')
heatmap_ax.set_ylabel('reference category')
plt.savefig('word2vecQuality.png', bbox_inches = 'tight')

In [None]:
# Macro-averaged F1 of the predictions (yNew) against the reference labels (yRight).
f1_score(yRight,yNew, labels = cats, average = 'macro')

In [None]:
# Macro-averaged recall of the predictions (yNew) against the reference labels (yRight).
recall_score(yRight,yNew, labels = cats, average = 'macro')

In [None]:
# Macro-averaged precision of the predictions (yNew) against the reference labels (yRight).
precision_score(yRight,yNew, labels = cats, average = 'macro')

# Final categorization of all posts:

In [None]:
# Assign a category to every post that has an embedded object.
result = dict()   # post id -> predicted category (a list if the post has several objects)
threshold = 0.3   # below this best similarity the object is labelled 'none'
no_emb = 0        # reset: the counter otherwise carries over from the evaluation cell
for postid, objid in postid_objid.items():
    # Use the object id(s) of THIS post. The original looked up
    # postid_objid[post] with the stale `post` left over from an earlier loop,
    # so every post was classified from the same object.
    # set_key stores a list when a post has several objects; each is classified.
    # NOTE(review): such posts get a list of categories in `result` — confirm
    # that consumers of postid_category.p expect this.
    objids = objid if type(objid) == list else [objid]
    for single_objid in objids:
        if single_objid not in dict_embeddings:
            no_emb += 1
            continue
        # similarity to every category, computed once per object
        similarities = cosine_similarity(cat_emb, [dict_embeddings[single_objid]])
        if similarities.max() < threshold:
            set_key(result, postid, 'none')
        else:
            set_key(result, postid, cats[similarities.argmax()])

print('There is {} embeddings missing'.format(no_emb))

In [None]:
# Persist the final post id -> category assignment next to the notebook.
with open("./postid_category.p", mode="wb") as out_file:
    pickle.dump(result, out_file)