# Setup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import pairwise_distances
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import os
from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings("ignore")

In [2]:
import nltk
import string

def lower(word):
    return word.lower()

lemm = nltk.stem.WordNetLemmatizer()
def lemmatize(word):
    return lemm.lemmatize(word)

punct = set(string.punctuation)
def no_punctuation(word):
    return ''.join(c for c in word if c not in punct)

stop_words = set(nltk.corpus.stopwords.words("english"))
def no_stop_words(word):
    return word if word not in stop_words else ''

strategy_map = {'lo':lower,'lem':lemmatize,
                'punct':no_punctuation,'stop':no_stop_words}

def preprocess(docs,strategies):
    for strategy in strategies:
        new_docs = []
        for doc in docs:
            new_doc = []
            for word in doc:
                transformed = strategy_map[strategy](word)
                if transformed:
                    new_doc.append(transformed)
            new_docs.append(new_doc)
        docs = new_docs
    return docs

In [3]:
documents_train = [] ## list of word
for i in range(10000):
    with open('../descriptions_train/%d.txt' % (i,)) as f:
        documents_train.append(f.read().split())
documents_train = preprocess(documents_train,['lo','punct'])

In [4]:
tag_docs_train = [] ## list of word
for i in range(10000):
    with open('../tags_train/%d.txt' % (i,),'r') as f:
        tag_docs_train.append([word for line in f.read().split('\n') for word in line.split(':') if word])

tag_docs_train = preprocess(tag_docs_train,['lo','punct'])

In [5]:
def get_prediction(vecs,pics):
    dists = pairwise_distances(vecs,pics,metric='cosine')
    return dists.argsort(1)

def evaluate(preds):
    ranks = [np.argwhere(vec==i)[0][0] for i,vec in enumerate(preds)]
    map_20 = np.mean([(20-rank)/20 if rank<20 else 0 for rank in ranks])
    ave_rank = np.mean(ranks)
    return map_20, ave_rank

# Tfidf

In [11]:
tfidf = TfidfVectorizer(stop_words=stop_words,min_df=2)

In [13]:
tfidf.fit([' '.join(doc) for doc in documents_train])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=2, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words={'a', 'about', 'above', 'after', 'again', 'against',
                            'ain', 'all', 'am', 'an', 'and', 'any', 'are',
                            'aren', "aren't", 'as', 'at', 'be', 'because',
                            'been', 'before', 'being', 'below', 'between',
                            'both', 'but', 'by', 'can', 'couldn', "couldn't", ...},
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
                vocabulary=None)

In [14]:
vectors_desc = tfidf.transform([' '.join(doc) for doc in documents_train])

In [15]:
vectors_desc.shape

(10000, 5508)

In [16]:
vectors_tags = tfidf.transform([' '.join(doc) for doc in tag_docs_train])

In [17]:
vectors_tags.shape

(10000, 5508)

In [18]:
preds = get_prediction(vectors_desc,vectors_tags)

In [19]:
evaluate(preds)

(0.09830499999999999, 824.3905)

# Gensim Doc2Vec

In [39]:
gensim_docs = []
for i in range(10000):
    gensim_docs.append(TaggedDocument([word for word in documents_train[i] if word not in stop_words],[i]))

In [40]:
model = Doc2Vec(gensim_docs, vector_size=300, window=8, 
                min_count=2, workers=4)

In [41]:
model.train(gensim_docs,total_examples=model.corpus_count,epochs=80)

In [42]:
vectors_desc = np.array([model[i] for i in range(10000)])

In [43]:
vectors_tags = np.zeros((10000,300))
for i in range(10000):
    vectors_tags[i] = model.infer_vector([word for word in tag_docs_train[i] if word not in stop_words])

In [44]:
preds = get_prediction(vectors_desc,vectors_tags)

In [45]:
evaluate(preds)

(0.026189999999999998, 2642.3745)

## Bert https://readthedocs.org/projects/bert-as-service/downloads/pdf/latest/

In [8]:
from bert_serving.client import BertClient

In [10]:
bc = BertClient('54.159.74.226')

In [54]:
documents_train = [] ## reprocess for bert
for i in range(10000):
    with open('../descriptions_train/%d.txt' % (i,)) as f:
        documents_train.append(' '.join(s.replace('.','').strip().lower() for s in f.read().split('\n')))
        
tag_docs_train = [] ## reprocess for bert
for i in range(10000):
    with open('../tags_train/%d.txt' % (i,),'r') as f:
        tag_docs_train.append(' '.join([word.lower().strip()
                               for line in f.read().split('\n')
                               for word in line.split(':') if word]))

In [56]:
vectors_desc = bc.encode(documents_train)

In [43]:
no_tags = set(i for i in range(10000) if not tag_docs_train[i])

In [50]:
res = bc.encode([tag_docs_train[i] for i in range(10000) if i not in no_tags])

In [51]:
vectors_tags = np.zeros((10000,1024))
counter = 0
for i in range(10000):
    if i not in no_tags:
        vectors_tags[i] = res[counter]
        counter += 1   

In [57]:
preds = get_prediction(vectors_desc,vectors_tags)

In [58]:
evaluate(preds)

(0.017285, 3296.9714)

In [60]:
documents_test = [] ## reprocess for bert
for i in range(2000):
    with open('../descriptions_test/%d.txt' % (i,)) as f:
        documents_test.append(' '.join(s.replace('.','').strip().lower() for s in f.read().split('\n')))
        
tag_docs_test = [] ## reprocess for bert
for i in range(2000):
    with open('../tags_test/%d.txt' % (i,),'r') as f:
        tag_docs_test.append(' '.join([word.lower().strip()
                               for line in f.read().split('\n')
                               for word in line.split(':') if word]))

In [61]:
vectors_desc_test = bc.encode(documents_test)

In [62]:
no_tags_test = set(i for i in range(2000) if not tag_docs_test[i])
res = bc.encode([tag_docs_test[i] for i in range(2000) if i not in no_tags_test])
vectors_tags_test = np.zeros((2000,1024))
counter = 0
for i in range(2000):
    if i not in no_tags_test:
        vectors_tags_test[i] = res[counter]
        counter += 1   

In [63]:
np.save('bert_desc_train',vectors_desc)
np.save('bert_tags_train',vectors_tags)
np.save('bert_desc_test',vectors_desc_test)
np.save('bert_tags_test',vectors_tags_test)

## Glove

In [6]:
word_dict = {}
with open('glove.6B/glove.6B.300d.txt','r') as f:
    for line in f.read().split('\n'):
        if not line: continue
        splitted = line.split()
        word = splitted[0]
        vector = np.array([float(n) for n in splitted[1:]])
        assert len(vector)==300
        word_dict[word] = vector

IndexError: list index out of range

In [8]:
len(word_dict)

400000

In [19]:
in_word_dict = set()
all_words = set(word for corpus in [documents_train,tag_docs_train] for doc in corpus for word in doc)
for word in all_words:
    if word in word_dict:
        in_word_dict.add(word)

In [20]:
len(in_word_dict)

9031

In [21]:
len(all_words)

9876

In [35]:
vectors_desc = np.zeros((10000,300))
for i in range(10000):
    words = [word for word in documents_train[i] if word not in stop_words]
    some_word = False
    for word in words:
        if word in word_dict:
            some_word = True
    if not some_word:
        continue
    doc_matrix = np.array([word_dict[word] for word in words if word in word_dict])
    vectors_desc[i] = doc_matrix.mean(0)

In [36]:
vectors_tags = np.zeros((10000,300))
for i in range(10000):
    words = [word for word in tag_docs_train[i] if word not in stop_words]
    some_word = False
    for word in words:
        if word in word_dict:
            some_word = True
    if not some_word:
        continue
    doc_matrix = np.array([word_dict[word] for word in words if word in word_dict])
    vectors_tags[i] = doc_matrix.mean(0)

In [37]:
preds = get_prediction(vectors_desc,vectors_tags)

In [38]:
evaluate(preds)

(0.086125, 928.6632)