In [2]:
# coding: utf-8

import argparse
from os.path import join
from time import time

import pandas as pd
from numpy import sqrt, newaxis, dot
from gensim import matutils
from gensim.models import Word2Vec, Doc2Vec

from train_w2v import EpochLogger, EpochSaver
from constants import ETL_PATH, DATASETS
from utils import tprint
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()
pd.options.display.max_rows = 2001

In [4]:
def load_embeddings(d2v_path, w2v_path):
    print("Doc2Vec loading", d2v_path)
    d2v = Doc2Vec.load(d2v_path)
    print('vocab size:', len(d2v.wv.vocab))
    print('docvecs size:', len(d2v.docvecs.vectors_docs))

    print("Word2Vec loading", w2v_path)
    w2v = Word2Vec.load(w2v_path)
    print('vocab size:', len(w2v.wv.vocab))

    return d2v, w2v

In [5]:
def get_phrases(max_title_length, min_doc_length, lemmatized_only=True):
    dewiki_phrases_lemmatized = 'dewiki_phrases_lemmatized.pickle'
    phrases = pd.read_pickle(join(ETL_PATH, dewiki_phrases_lemmatized))
    # creating a list containing original and lemmatized phrases
    phrases = phrases.query(f"doc_len >= {min_doc_length} and title_len <= {max_title_length}")
    if lemmatized_only:
        phrases = phrases.token.unique()
    else:
        phrases = phrases.token.append(phrases.text).unique()
    return phrases

In [6]:
def get_indices(d2v, w2v, max_title_length=4, min_doc_length=40):
    phrases = get_phrases(max_title_length=max_title_length, min_doc_length=min_doc_length)
    d_indices = []
    w_indices = []
    dout = wout = 0
    for label in phrases:
        try:
            idx = d2v.docvecs.doctags[label].offset
            d_indices.append(idx)
        except:
            dout += 1
        try:
            idx = w2v.wv.vocab[label].index
            w_indices.append(idx)
        except:
            wout += 1
    return d_indices, w_indices

In [7]:
def index_embeddings(d2v, w2v, d_indices, w_indices):
    """
    Modifies the argument models. Normalizes the d2v und w2v vectors.
    Also reduces the number of d2v docvecs.
    """
    d_indices = list(set(d_indices))
    w_indices = list(set(w_indices))
    print('d2v indices', len(d_indices))
    print('w2v indices', len(w_indices))

    # Models normalised in unit vectord from the indices given above in pickle files.
    d2v.wv.vectors_norm = (
        d2v.wv.vectors / sqrt((d2v.wv.vectors ** 2).sum(-1))[..., newaxis]
    )
    d2v.docvecs.vectors_docs_norm = (
        d2v.docvecs.vectors_docs / sqrt((d2v.docvecs.vectors_docs ** 2).sum(-1))[..., newaxis]
    )[d_indices]
    print('d2v.wv.vectors_norm', len(d2v.wv.vectors_norm))
    print('d2v.docvecs.vectors_docs_norm', len(d2v.docvecs.vectors_docs_norm))
    print("doc2vec normalized")

    w2v.wv.vectors_norm = (
        w2v.wv.vectors / sqrt((w2v.wv.vectors ** 2).sum(-1))[..., newaxis]
    )
    w2v_indexed = w2v.wv.vectors_norm[w_indices]
    print('w2v.wv.vectors_norm', len(w2v.wv.vectors_norm))
    print('w2v_indexed', len(w2v_indexed))
    print("word2vec normalized")
    return w2v_indexed

In [8]:
def load_topics(topics_path, metrics, params, nbtopics, print_sample=False):
    print("Loading topics", topics_path)
    topics = (
        pd
        .read_csv(topics_path, index_col=[0, 1, 2, 3, 4])
        .query('metric in @metrics and param_id in @params and nb_topics in @nbtopics')
        .reset_index(drop=True)
    )
    if print_sample:
        tprint(topics)
    else:
        print('number of topics', len(topics))
    return topics

In [14]:
dataset = DATASETS['O']
nbfiles = None
nbfiles_str = f'_{args.nbfiles:02d}' if nbfiles else ''
version = 'noun'
topics_file = join(ETL_PATH, 'LDAmodel', version, 'Reranker', f'{dataset}{nbfiles_str}_topic-candidates.csv')
labels_file = join(ETL_PATH, 'LDAmodel', version, 'Reranker', f'{dataset}{nbfiles_str}_label-candidates.csv')
emb_path = join(ETL_PATH, 'embeddings')
d2v_path = join(emb_path, 'd2v', 'd2v')
w2v_path = join(emb_path, 'w2v', 'w2v')
metrics = ['ref']
params = ['e42']
nb_topics = [10]
max_title_length = 4
min_doc_length = 40
nb_labels = 20

In [15]:
topics = load_topics(topics_file, metrics, params, nb_topics, print_sample=True)

Loading topics ../data/preprocessed/LDAmodel/noun/Reranker/OnlineParticipation_topic-candidates.csv
|    | term0      | term1         | term2              | term3          | term4       | term5      | term6        | term7       | term8         | term9         |
|---:|:-----------|:--------------|:-------------------|:---------------|:------------|:-----------|:-------------|:------------|:--------------|:--------------|
|  0 | Mensch     | Müll          | Köln               | Gebäude        | Jahr        | Stadt      | Mülleimer    | Leute       | Bad_Godesberg | Mülheim       |
|  1 | Kind       | Schule        | Jugendliche        | Vorschlag      | gut_Idee    | Elter      | Kita         | Idee        | Jahr          | Mensch        |
|  2 | Vorschlag  | Beitrag       | Dank               | Kategorie      | Verständnis | Moderation | Thema        | Auswertung  | Hallo         | Themenbereich |
|  3 | Radfahrer  | Radweg        | Auto               | Richtung       | Straße      | Fu

In [None]:
d2v, w2v = load_embeddings(d2v_path, w2v_path)
d_indices, w_indices = get_indices(d2v, w2v, max_title_length, min_doc_length)
w2v_indexed = index_embeddings(d2v, w2v, d_indices, w_indices)

In [17]:
def get_labels(topic, nb_labels, d2v, w2v, w2v_indexed, d_indices, w_indices):
    # TODO: simplify
    topic_len = len(topic)

    val_d2v = 0.0
    val_w2v = 0.0
    store_indices = []
    for term in topic:
        try:
            # The word2vec value of topic word from doc2vec trained model
            temp_d2v = d2v.wv.vectors_norm[d2v.wv.vocab[term].index]
        except KeyError:
            pass
        else:
            # Getting the unit vector
            mean_d2v = matutils.unitvec(temp_d2v)
            # The dot product of all labels in doc2vec with the unit vector of topic word
            dists_d2v = dot(d2v.docvecs.vectors_docs_norm, mean_d2v)
            val_d2v += dists_d2v

        try:
            temp_w2v = w2v.wv.vectors_norm[w2v.wv.vocab[term].index]
            # The word2vec value of topic word from word2vec trained model
        except KeyError:
            pass
        else:
            # Unit vector
            mean_w2v = matutils.unitvec(temp_w2v)
            # dot product of all possible labels in word2vec vocab with the unit vector of the topic term
            dists_w2v = dot(w2v_indexed, mean_w2v)
            """
            This next section of code checks if the topic word is also a potential label in trained 
            word2vec model. If that is the case, it is important the dot product of label with that 
            topic word is not taken into account.Hence we make that zero and further down the code 
            also exclude it in taking average of that label over all topic words. 
            """
            if w2v.wv.vocab[term].index in w_indices:
                i_val = w_indices.index(w2v.wv.vocab[term].index)
                store_indices.append(i_val)
                dists_w2v[i_val] = 0.0
            val_w2v += dists_w2v

    # Give the average vector over all topic words
    avg_d2v = val_d2v / topic_len
    avg_w2v = val_w2v / topic_len

    # This modifies the average w2v vector for cases in which the w2v label was same as topic term.
    for element in store_indices:
        avg_w2v[element] = (avg_w2v[element] * topic_len) / (topic_len - 1)

    # argsort and get top 100 doc2vec label indices
    best_d2v = matutils.argsort(avg_d2v, topn=100, reverse=True)
    best_w2v = matutils.argsort(avg_w2v, topn=100, reverse=True)

    result_d2v = []
    # Get the doc2vec labels from indices
    for element in best_d2v:
        ind = d_indices[element]
        temp = d2v.docvecs.index_to_doctag(ind)
        result_d2v.append((temp, float(avg_d2v[element])))

    # Get the word2vec labels from indices
    result_w2v = []
    for element in best_w2v:
        ind = w_indices[element]
        temp = w2v.wv.index2word[ind]
        result_w2v.append((temp, float(avg_w2v[element])))

    # Get the combined set of both doc2vec labels and word2vec labels
    comb_labels = list(set([j[0] for j in result_d2v] + [k[0] for k in result_w2v]))

    # Get indices from combined labels
    newlist_d2v = []
    newlist_w2v = []
    for word in comb_labels:
        try:
            newlist_d2v.append(d_indices.index(d2v.docvecs.doctags[word].offset))
            newlist_w2v.append(w_indices.index(w2v.wv.vocab[word].index))
        except:
            pass
    newlist_d2v = list(set(newlist_d2v))
    newlist_w2v = list(set(newlist_w2v))

    # Finally again get the labels from indices. We searched for the score from both d2v and w2v models.
    resultlist_d2v_new = [
        (d2v.docvecs.index_to_doctag(d_indices[elem_]), float(avg_d2v[elem_]))
        for elem_ in newlist_d2v
    ]
    resultlist_w2v_new = [
        (w2v.wv.index2word[w_indices[elem_]], float(avg_w2v[elem_]))
        for elem_ in newlist_w2v
    ]

    # Finally get the combined score with the label. The label used will be of doc2vec not of word2vec.
    new_score = []
    for term in resultlist_w2v_new:
        k, v = term
        for elem in resultlist_d2v_new:
            k2, v2 = elem
            if k == k2:
                v3 = v + v2
                new_score.append((k2, v3))
    new_score = sorted(new_score, key=lambda x: x[1], reverse=True)[:nb_labels]
    print(new_score)
    print(type(new_score))
    print(len(new_score))
    return new_score

In [18]:
%%time
labels = topics.progress_apply(
    lambda row: get_labels(row, nb_labels, d2v, w2v, w2v_indexed, d_indices, w_indices),
    axis=1
)
labels
#print("Writing labels to", labels_file)
#labels.to_csv(labels_file)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

  if np.issubdtype(vec.dtype, np.int):


[('Cazères', 0.4802137017250061), ('Urban_Kreutzbach', 0.4747736304998398), ('Friedrich_zu_Solms-Baruth', 0.4682406485080719), ('Heideblick', 0.4675930589437485), ('The_Walk', 0.44548118114471436), ('James_Franck', 0.42924121022224426), ('Rezeptor', 0.41731926798820496), ('Rodlera', 0.41555921733379364), ('Willamette_Valley', 0.4098128154873848), ('Rhône', 0.4093203693628311), ('Treverer', 0.407368004322052), ('Holzgerlingen', 0.40681494772434235), ('Hans_Wolff', 0.39921577274799347), ('Willmann', 0.39887455850839615), ('Dolní_Paseky', 0.396098792552948), ('Gänsesäger', 0.3910842463374138), ('Free_Austrian_Movement', 0.3906186372041702), ('Step_BY_Step', 0.3888951689004898), ('Schwimmweltmeisterschaft', 0.3881979286670685), ('Lioba', 0.3858857601881027)]
<class 'list'>
20
[('Fulleren', 0.49891693890094757), ('1510', 0.49438878893852234), ('Konrad_Duden', 0.4876597076654434), ('astronomisch_Objekt', 0.4857056029140949), ('Kompaktkassette', 0.4844680279493332), ('Tozama-Daimyō', 0.481676

In [22]:
pd.DataFrame.from_records(list(labels))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,"(Cazères, 0.4802137017250061)","(Urban_Kreutzbach, 0.4747736304998398)","(Friedrich_zu_Solms-Baruth, 0.4682406485080719)","(Heideblick, 0.4675930589437485)","(The_Walk, 0.44548118114471436)","(James_Franck, 0.42924121022224426)","(Rezeptor, 0.41731926798820496)","(Rodlera, 0.41555921733379364)","(Willamette_Valley, 0.4098128154873848)","(Rhône, 0.4093203693628311)","(Treverer, 0.407368004322052)","(Holzgerlingen, 0.40681494772434235)","(Hans_Wolff, 0.39921577274799347)","(Willmann, 0.39887455850839615)","(Dolní_Paseky, 0.396098792552948)","(Gänsesäger, 0.3910842463374138)","(Free_Austrian_Movement, 0.3906186372041702)","(Step_BY_Step, 0.3888951689004898)","(Schwimmweltmeisterschaft, 0.3881979286670685)","(Lioba, 0.3858857601881027)"
1,"(Fulleren, 0.49891693890094757)","(1510, 0.49438878893852234)","(Konrad_Duden, 0.4876597076654434)","(astronomisch_Objekt, 0.4857056029140949)","(Kompaktkassette, 0.4844680279493332)","(Tozama-Daimyō, 0.4816760718822479)","(Tulpenmanie, 0.4772815927863121)","(Balearische_Insel, 0.4752257317304611)","(Gerhard_Berger, 0.475117564201355)","(Bell_Media, 0.47391198575496674)","(Freiberg, 0.46770521998405457)","(Amage, 0.4656752943992615)","(TOM_Stafford, 0.46256110072135925)","(Verkehrsclub_Deutschland, 0.4623275101184845)","(Freedom, 0.4619446247816086)","(Pozorka, 0.46103082597255707)","(Glücksspiel, 0.45948904752731323)","(Bintang, 0.45763690769672394)","(Sox, 0.4568983316421509)","(Kombucha, 0.4551253914833069)"
2,"(Nebenschilddrüse, 0.5348938405513763)","(Christentum, 0.5268805474042892)","(Häufigkeitsklasse, 0.5150118470191956)","(Pleigne, 0.5137563794851303)","(Kompaktkassette, 0.5119221806526184)","(Macunaíma, 0.5068587362766266)","(Holland, 0.49850650131702423)","(Revolt, 0.49486738443374634)","(Kunsthalle_Bremen, 0.49243123829364777)","(Ausspähen_von_Daten, 0.49140915274620056)","(Jörg_Sander, 0.4853502959012985)","(Valbirse, 0.4799789637327194)","(London_Tower, 0.46768319606781006)","(Grodzisk_Wielkopolski, 0.4640638679265976)","(Hemmoorer_Eimer, 0.4625903367996216)","(Tai-Volk, 0.46220996230840683)","(Insektenstich, 0.45797084271907806)","(John_Coltrane, 0.4548836350440979)","(Kabire, 0.4541536271572113)","(Patch, 0.4520420730113983)"
3,"(Lindewerra, 0.6244032382965088)","(Segelschiff, 0.6140712052583694)","(Schloss_Remplin, 0.6107631325721741)","(Inkommensurabilität, 0.6034567952156067)","(Wolframcarbid, 0.6033321470022202)","(TOM_Taylor, 0.5975641012191772)","(Schaufenberg, 0.5910590440034866)","(Presskuchen, 0.5831977427005768)","(Trüebsee, 0.5830600261688232)","(Datenkompression, 0.5816420689225197)","(graphisch_Viertel, 0.5806907564401627)","(Hausdorff-Dimension, 0.5745630264282227)","(Proschim, 0.5711553692817688)","(Judaismus, 0.5709549635648727)","(Hans_Geiger, 0.5666165351867676)","(Baltikum, 0.5629711747169495)","(Avenbach, 0.5628282949328423)","(Tv-Media, 0.561182513833046)","(Artois, 0.5606314986944199)","(Montgomery_County, 0.5601844638586044)"
4,"(Freiberg, 0.4835883229970932)","(Kanton_Sartenais-Valinco, 0.4835146814584732)","(Cazères, 0.456076055765152)","(Balearische_Insel, 0.45332901924848557)","(Verkehrsclub_Deutschland, 0.4502774626016617)","(Bintang, 0.44587622582912445)","(Pozorka, 0.4457913935184479)","(Galleria_d’Arte_Moderna, 0.4430612847208977)","(Tulpenmanie, 0.44289083033800125)","(Emirau, 0.4376486837863922)","(Bayer_1, 0.43743516504764557)","(Die_Kammer, 0.43715204298496246)","(Montigny-lès-Metz, 0.435620054602623)","(Flaneur, 0.4344683289527893)","(John_Stuart, 0.4341833293437958)","(Amplitude, 0.43273258209228516)","(Günzenhausen, 0.4313894435763359)","(Arbeiterpartei_Kurdistan, 0.42859384417533875)","(österreichisch_Zeitung, 0.4275805503129959)","(Villa_Flora, 0.42575572431087494)"
5,"(Marinekorps_Flandern, 0.5894676744937897)","(Gustav-Hertz-Preis, 0.5837655365467072)","(Tillyschanze, 0.5576960146427155)","(Oettinger_Brauerei, 0.553833931684494)","(Montgomery_County, 0.5401354283094406)","(Datenkompression, 0.5340378060936928)","(Heracles_Almelo, 0.5326370000839233)","(Cuno_Hoffmeister, 0.5304832383990288)","(Missouri_River_Township, 0.5297190546989441)","(Baltikum, 0.5270441174507141)","(DIN_1310, 0.5262212455272675)","(Segelschiff, 0.5184870362281799)","(François_Blondel, 0.5176179707050323)","(Umerziehung_durch_Arbeit, 0.5147275030612946)","(Alleghe, 0.5125963240861893)","(Wolframcarbid, 0.5093449652194977)","(philosophisch_Jahrbuch, 0.5065541714429855)","(Proschim, 0.5038587599992752)","(Günter_Schlegel, 0.501641720533371)","(Avenbach, 0.5012599900364876)"
6,"(Cazères, 0.5013900548219681)","(positiv_Matrix, 0.4716911315917969)","(Crillon, 0.45845499634742737)","(Niederlangenbach, 0.45116472989320755)","(Fredrik_Lindström, 0.44592034816741943)","(Lambertsberg, 0.44443798810243607)","(Pif_Gadget, 0.425390288233757)","(Emirau, 0.4241499751806259)","(Muslim-Markt, 0.4169306084513664)","(Macintosh, 0.4103366658091545)","(Tato, 0.4096692055463791)","(Rotensee, 0.4085226356983185)","(Zentral-_und_Landesbibliothek_Berlin, 0.40245...","(Jupiter, 0.39841228723526)","(Heiliges_römisch_Reich, 0.397032655775547)","(Monte_Cristo, 0.39699113368988037)","(Michaëlla_Krajicek, 0.3933965265750885)","(Arbeitskampf, 0.393042728304863)","(Heideblick, 0.389748215675354)","(John_Herschel, 0.38783539831638336)"
7,"(Dättwil, 0.5535447895526886)","(James_Franck, 0.5430155098438263)","(Tillyschanze, 0.5415567755699158)","(Agrianes, 0.5324253141880035)","(Pharmakologie, 0.5295806974172592)","(Nightlife, 0.5191638916730881)","(Weigand_von_Redwitz, 0.5141817033290863)","(Lindewerra, 0.5128759145736694)","(Uhland, 0.5092276930809021)","(Terry_Gilliam, 0.5067149996757507)","(Terzaghi_Lecture, 0.5052666217088699)","(Tuchhalle, 0.48802027106285095)","(variabel_Kosten, 0.48781777173280716)","(Glossar, 0.48306943476200104)","(Rodlera, 0.480293944478035)","(Burgalb, 0.47990404069423676)","(Umerziehung_durch_Arbeit, 0.4794326275587082)","(Progerie, 0.4787616729736328)","(atlantisch_Ozean, 0.4709400087594986)","(Flussdelta, 0.4692968279123306)"
8,"(Marinekorps_Flandern, 0.46834635734558105)","(Inkommensurabilität, 0.46695709228515625)","(Madentherapie, 0.4530021846294403)","(Bezirk_Hamburg-Nord, 0.4473436325788498)","(Aurealis_Award, 0.4429512694478035)","(Dungeon_Master, 0.43968968093395233)","(Mount_Gambier, 0.4366196542978287)","(Hausdorff-Dimension, 0.43433863669633865)","(golden_Leopard, 0.43350696563720703)","(Kesseltreibe, 0.4278239607810974)","(Transrapid, 0.42769068479537964)","(Lindewerra, 0.4225156754255295)","(Heracles_Almelo, 0.42163340747356415)","(Nightlife, 0.42131784558296204)","(Okiya, 0.4212995022535324)","(Visual_Art_’s, 0.4179495871067047)","(Oráčov, 0.4149642288684845)","(Augustin_von_Balthasar, 0.4079091027379036)","(Produkthaftung, 0.4054200053215027)","(Tromøy, 0.40419207513332367)"
9,"(philosophisch_Jahrbuch, 0.5329182147979736)","(Dammriss, 0.5276550054550171)","(Übersetzen, 0.5126512944698334)","(Tillyschanze, 0.5036217570304871)","(DIN_1310, 0.48904527723789215)","(Knox-Küste, 0.4841541349887848)","(Tobias_Neumann, 0.48395219445228577)","(Pinus_patula, 0.4802192747592926)","(Krasnaja_Poljana, 0.4754106104373932)","(Johann_Pachelbel, 0.4717692881822586)","(Corbenay, 0.4653577655553818)","(Kraweel, 0.4642474800348282)","(New_Haven_Colony, 0.461770661175251)","(Transrapid, 0.46128834784030914)","(Ganzheit, 0.46066565811634064)","(Columban_von_Luxeuil, 0.45411957055330276)","(Strahl, 0.45190832018852234)","(Kesseltreibe, 0.4499427005648613)","(30_Rock, 0.4475810378789902)","(Hanse, 0.44599615037441254)"
