In [1]:
# coding: utf-8

import argparse
import pickle
import re
from os.path import join

import pandas as pd
from numpy import dot, float32 as REAL, sqrt, newaxis
from gensim import matutils
from gensim.models import Word2Vec, Doc2Vec

from train_utils import init_logging
from constants import ETL_PATH, DATASETS
from tqdm import tqdm

# Let pandas print up to 200 rows before truncating displayed frames.
pd.options.display.max_rows = 200
# Register tqdm's pandas integration; the `progress_apply` call used later in this
# notebook depends on it.
tqdm.pandas()

In [22]:
# Module-level logger used by the helper functions below. It stays None until the
# driver cell rebinds it with init_logging(); helpers that call LOG.info must not run
# before that.
LOG = None


def get_indices(d2v, w2v, max_title_length=4, min_doc_length=41):
    """
    Collect the embedding indices of all candidate phrases known to the models.

    The candidate phrases come from ``get_phrases``. Each phrase is looked up as a
    doctag in the doc2vec model and as a vocabulary entry in the word2vec model;
    phrases missing from a model are skipped for that model only.

    :param d2v: doc2vec model exposing ``docvecs.doctags[label].offset``.
    :param w2v: word2vec model exposing ``wv.vocab[label].index``.
    :param max_title_length: forwarded to ``get_phrases``.
    :param min_doc_length: forwarded to ``get_phrases``.
    :return: tuple ``(d2v_indices, w2v_indices)`` of two lists of integer indices,
        in the order the phrases were visited.
    """
    phrases = get_phrases(max_title_length=max_title_length, min_doc_length=min_doc_length)
    d2v_indices = []
    w2v_indices = []
    dout = wout = 0
    for label in phrases:
        # Only a missing key is an expected miss. Anything else (including a kernel
        # interrupt) must propagate instead of being counted as "not found".
        try:
            d2v_indices.append(d2v.docvecs.doctags[label].offset)
        except KeyError:
            dout += 1
        try:
            w2v_indices.append(w2v.wv.vocab[label].index)
        except KeyError:
            wout += 1
    # Report the miss counters. LOG may still be None if the function is used
    # before logging has been initialised.
    if LOG is not None:
        LOG.info(f'phrases missing in doc2vec: {dout}, missing in word2vec: {wout}')
    return d2v_indices, w2v_indices


def index_embeddings(d2v, w2v, d2v_indices, w2v_indices):
    """
    Normalize the d2v and w2v vectors to unit length. Modifies the argument models.

    The normalized doc2vec document vectors are additionally reduced to the rows
    listed in ``d2v_indices`` before being stored on the model. The normalized
    word2vec matrix is stored in full; its reduction to ``w2v_indices`` is returned.

    :return: rows ``w2v_indices`` of the normalized word2vec matrix.
    """
    def _unit_rows(matrix):
        # Scale every row to Euclidean length 1 and cast the result to float32.
        lengths = sqrt((matrix ** 2).sum(-1))[..., newaxis]
        return (matrix / lengths).astype(REAL)

    # doc2vec: word vectors are kept in full, document vectors only for the given indices.
    d2v.wv.syn0norm = _unit_rows(d2v.wv.syn0)
    d2v.docvecs.vectors_docs_norm = _unit_rows(d2v.docvecs.doctag_syn0)[d2v_indices]
    LOG.info("doc2vec normalized")

    # word2vec: store the full normalized matrix, hand back only the selected rows.
    w2v.wv.syn0norm = _unit_rows(w2v.wv.syn0)
    selected_rows = w2v.wv.syn0norm[w2v_indices]
    LOG.info("word2vec normalized")
    return selected_rows


def load_topics(topics_path, metrics, params, nbtopics, print_sample=False):
    """
    Read a topics CSV file and reduce it to the topic-term columns.

    Each of the three filters is applied only when its argument is truthy and the
    corresponding column exists in the file.

    :param topics_path: path of the CSV file to read.
    :param metrics: values to keep in the ``metric`` column, or None/empty for no filter.
    :param params: values to keep in the ``param_id`` column, or None/empty for no filter.
    :param nbtopics: values to keep in the ``nb_topics`` column, or None/empty for no filter.
    :param print_sample: if True, log the first 10 rows of the result.
    :return: DataFrame without the metadata columns and with a fresh 0..n-1 index.
    """
    LOG.info(f'Loading topics {topics_path}')
    # TODO (left open by the original author; what remains to be done is not stated)
    topics = pd.read_csv(topics_path, index_col=None)
    if metrics and 'metric' in topics.columns:
        topics = topics[topics.metric.isin(metrics)]
    if params and 'param_id' in topics.columns:
        topics = topics[topics.param_id.isin(params)]
    if nbtopics and 'nb_topics' in topics.columns:
        topics = topics[topics.nb_topics.isin(nbtopics)]
    # Drop the metadata columns; errors='ignore' tolerates files that lack some of them.
    topics = (
        topics.drop(
            ['dataset', 'metric', 'param_id', 'nb_topics', 'topic_idx', 'topic_id', 'domain'],
            axis=1, errors='ignore'
        )
        .reset_index(drop=True)
    )
    if print_sample:
        # Render the frame to text first: `'\n' + DataFrame` would be an element-wise
        # addition on the frame, not a string concatenation.
        LOG.info('\n' + topics.head(10).to_string())
    LOG.info(f'number of topics {len(topics)}')
    return topics


def parse_args():
    """
    Parse the command line and derive the topics and labels file names.

    If ``--topics_file`` is given, the dataset-related options (nbfiles, version,
    metrics, params, nbtopics) are reset to None and the labels file name defaults
    to the topics file name with its ``.csv`` suffix replaced by
    ``_label-candidates.csv``. Otherwise both file names are built from the dataset
    options below ``LDA_PATH``.

    If both index files are given, ``max_title_length`` and ``min_doc_length`` are
    reset to None.

    :return: tuple ``(topics_file, labels_file, d2v_indices, w2v_indices, d2v_path,
        w2v_path, metrics, params, nbtopics, max_title_length, min_doc_length,
        nblabels, args)``.
    """
    # NOTE(review): EMB_PATH and LDA_PATH are used below, but the notebook's import
    # cell only pulls ETL_PATH and DATASETS from `constants`. Importing them here
    # assumes `constants` defines both - confirm.
    from constants import EMB_PATH, LDA_PATH

    parser = argparse.ArgumentParser()

    parser.add_argument("--topics_file", type=str, required=False)
    parser.add_argument("--labels_file", type=str, required=False)
    parser.add_argument("--d2v_indices", type=str, required=False)
    parser.add_argument("--w2v_indices", type=str, required=False)

    emb_path = EMB_PATH
    parser.add_argument("--d2v_path", type=str, required=False, default=join(emb_path, 'd2v', 'd2v'))
    parser.add_argument("--w2v_path", type=str, required=False, default=join(emb_path, 'w2v', 'w2v'))

    parser.add_argument("--dataset", type=str, required=False)
    parser.add_argument("--nbfiles", type=int, required=False, default=None)
    parser.add_argument("--version", type=str, required=False, default='noun')
    parser.add_argument("--metrics", nargs='*', type=str, required=False, default=['ref'])
    parser.add_argument("--params", nargs='*', type=str, required=False, default=['e42'])
    parser.add_argument("--nbtopics", nargs='*', type=int, required=False, default=[100])

    parser.add_argument("--nblabels", type=int, required=False, default=20)
    parser.add_argument("--max_title_length", type=int, required=False, default=4)
    parser.add_argument("--min_doc_length", type=int, required=False, default=41)

    args = parser.parse_args()

    # Unknown dataset keys are passed through unchanged.
    dataset = DATASETS.get(args.dataset, args.dataset)
    # Must be computed before args.nbfiles is possibly reset to None below.
    nbfiles_str = f'_nbfiles{args.nbfiles:02d}' if args.nbfiles else ''
    if args.topics_file:
        topics_file_is_given = True
        args.nbfiles = args.version = args.metrics = args.params = args.nbtopics = None
    else:
        topics_file_is_given = False
        args.topics_file = join(
            LDA_PATH, args.version, 'topics',
            f'{dataset}{nbfiles_str}_topic-candidates.csv'
        )
    if args.labels_file is None:
        if topics_file_is_given:
            # Remove a trailing '.csv' only. str.strip('.csv') would strip any of the
            # characters '.', 'c', 's', 'v' from both ends and mangle the name.
            suffix = '.csv'
            stem = args.topics_file
            if stem.endswith(suffix):
                stem = stem[:-len(suffix)]
            args.labels_file = stem + '_label-candidates.csv'
        else:
            args.labels_file = join(
                LDA_PATH, args.version, 'topics',
                f'{dataset}{nbfiles_str}_label-candidates.csv'
            )

    # The phrase-length limits are reset when precomputed index files are supplied.
    if args.d2v_indices and args.w2v_indices:
        args.max_title_length = None
        args.min_doc_length = None

    return (
        args.topics_file, args.labels_file, args.d2v_indices, args.w2v_indices,
        args.d2v_path, args.w2v_path,
        args.metrics, args.params, args.nbtopics,
        args.max_title_length, args.min_doc_length, args.nblabels, args
    )


def fake_args():
    """
    Return a fixed argument tuple in the same layout as ``parse_args``.

    The tuple points at files below ``NETL/model_run``, disables the metrics, params
    and nbtopics filters (all None) and uses an empty dict in place of the parsed
    argparse namespace.
    """
    # topics_file, labels_file
    data_files = (
        "NETL/model_run/data/topics.csv",
        "NETL/model_run/output_candidates_topics_gen35_newimp",
    )
    # d2v_indices, w2v_indices
    index_files = (
        "NETL/model_run/support_files/en/doc2vec_indices_en",
        "NETL/model_run/support_files/en/word2vec_indices_en",
    )
    # d2v_path, w2v_path
    model_files = (
        "NETL/model_run/pre_trained_models/doc2vec_en/docvecmodel.d2v",
        "NETL/model_run/pre_trained_models/word2vec_en/word2vec",
    )
    # metrics, params, nbtopics
    topic_filters = (None, None, None)
    # max_title_length, min_doc_length, nblabels
    limits = (4, 41, 19)
    return data_files + index_files + model_files + topic_filters + limits + ({},)


def get_word(word):
    """
    Remove a parenthesised qualifier such as ``_(software)`` from a label.

    Values that are not of type ``str`` and strings without a qualifier made of
    letters, digits or underscores are returned unchanged. When such a qualifier is
    found, everything from the first ``_(`` up to the last ``)`` is removed.
    """
    if type(word) is not str:
        return word
    has_qualifier = re.search(r"_\(([A-Za-z0-9_]+)\)", word) is not None
    return re.sub(r'_\(.+\)', '', word) if has_qualifier else word


def load_embeddings(d2v_path, w2v_path):
    """
    Load the doc2vec and word2vec models and log their sizes.

    :param d2v_path: path passed to ``Doc2Vec.load``.
    :param w2v_path: path passed to ``Word2Vec.load``.
    :return: tuple ``(d2v, w2v)`` of the loaded models.
    """
    LOG.info(f'Doc2Vec loading {d2v_path}')
    d2v = Doc2Vec.load(d2v_path)
    # The expression needs braces, otherwise the f-string logs the literal text
    # 'len(d2v.wv.vocab)' instead of the vocabulary size.
    LOG.info(f'vocab size: {len(d2v.wv.vocab)}')
    LOG.info(f'docvecs size: {len(d2v.docvecs.vectors_docs)}')

    LOG.info(f'Word2Vec loading {w2v_path}')
    w2v = Word2Vec.load(w2v_path)
    LOG.info(f'vocab size: {len(w2v.wv.vocab)}')
    return d2v, w2v


def get_phrases(max_title_length, min_doc_length, lemmatized_only=True):
    """
    Load the candidate phrases and filter them by document and title length.

    Reads ``dewiki_phrases_lemmatized.pickle`` from ``ETL_PATH`` and keeps the rows
    with ``doc_len >= min_doc_length`` and ``title_len <= max_title_length``.

    :param max_title_length: upper bound for the ``title_len`` column.
    :param min_doc_length: lower bound for the ``doc_len`` column.
    :param lemmatized_only: if True, use only the ``token`` column; otherwise use
        the unique values of ``token`` and ``text`` together.
    :return: a lazy ``filter`` iterator over the unique phrases that start with a
        letter (ASCII or German umlaut). It can be consumed only once.
    """
    dewiki_phrases_lemmatized = 'dewiki_phrases_lemmatized.pickle'
    # NOTE(review): read_pickle unpickles the file - only use it on trusted files.
    phrases = pd.read_pickle(join(ETL_PATH, dewiki_phrases_lemmatized))
    phrases = phrases.query(f"doc_len >= {min_doc_length} and title_len <= {max_title_length}")
    if lemmatized_only:
        phrases = phrases.token.unique()
    else:
        # original and lemmatized phrases together; pd.concat replaces Series.append,
        # which newer pandas releases no longer provide.
        phrases = pd.concat([phrases.token, phrases.text]).unique()
    pat = re.compile(r'^[a-zA-ZÄÖÜäöü]+.*')
    phrases = filter(pat.match, phrases)
    return phrases

In [23]:
# Use the hard-coded argument set from fake_args() instead of parsing a command line.
# Note the renames on unpacking: the index *file names* become d2v_indices_file /
# w2v_indices_file, nbtopics becomes nb_topics and nblabels becomes nb_labels.
(
    topics_file, labels_file, d2v_indices_file, w2v_indices_file,
    d2v_path, w2v_path,
    metrics, params, nb_topics,
    max_title_length, min_doc_length, nb_labels, args
) = fake_args()

# Rebind the module-level LOG so the helper functions defined above can log.
LOG = logger = init_logging(name='label_candidates')
# disabled: log_args(logger, args)

# With metrics, params and nb_topics all None, no row filter is applied.
topics = load_topics(
    topics_path=topics_file, metrics=metrics, params=params, nbtopics=nb_topics, print_sample=False
)

2018-11-17 01:21:42,665 : INFO : pandas: 0.23.4
2018-11-17 01:21:42,666 : INFO : gensim: 3.5.0
2018-11-17 01:21:42,667 : INFO : python: 3.6.5 |Anaconda, Inc.| (default, Apr 29 2018, 16:14:56)  [GCC 7.2.0]
2018-11-17 01:21:42,668 : INFO : Loading topics NETL/model_run/data/topics.csv
2018-11-17 01:21:42,674 : INFO : number of topics 228


In [3]:
# Load both embedding models from disk.
d2v, w2v = load_embeddings(
    d2v_path=d2v_path, w2v_path=w2v_path
)

# Prefer precomputed index files when both are given; otherwise derive the indices
# from the phrase list.
if d2v_indices_file and w2v_indices_file:
    # NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
    logger.info(f'loading {d2v_indices_file}')
    with open(d2v_indices_file, 'rb') as fp:
        d2v_indices = pickle.load(fp)
    logger.info(f'loading {w2v_indices_file}')
    with open(w2v_indices_file, 'rb') as fp:
        w2v_indices = pickle.load(fp)
else:
    d2v_indices, w2v_indices = get_indices(
        d2v=d2v, w2v=w2v, max_title_length=max_title_length, min_doc_length=min_doc_length
    )
# De-duplicate and sort into plain lists; get_labels calls list.index() on them.
d2v_indices = sorted(set(d2v_indices))
w2v_indices = sorted(set(w2v_indices))

# Normalizes both models in place (and reduces the doc2vec document vectors to
# d2v_indices); returns the normalized word2vec rows selected by w2v_indices.
w2v_indexed = index_embeddings(
    d2v=d2v, w2v=w2v, d2v_indices=d2v_indices, w2v_indices=w2v_indices
)

2018-11-17 00:45:06,325 : INFO : pandas: 0.23.4
2018-11-17 00:45:06,326 : INFO : gensim: 3.5.0
2018-11-17 00:45:06,327 : INFO : python: 3.6.5 |Anaconda, Inc.| (default, Apr 29 2018, 16:14:56)  [GCC 7.2.0]
2018-11-17 00:45:06,327 : INFO : Loading topics NETL/model_run/data/topics.csv
2018-11-17 00:45:06,341 : INFO : number of topics 4
2018-11-17 00:45:06,342 : INFO : Doc2Vec loading NETL/model_run/pre_trained_models/doc2vec_en/docvecmodel.d2v
2018-11-17 00:45:06,342 : INFO : loading Doc2Vec object from NETL/model_run/pre_trained_models/doc2vec_en/docvecmodel.d2v
2018-11-17 00:45:06,473 : INFO : Model saved using code from earlier Gensim Version. Re-loading old model in a compatible way.
2018-11-17 00:45:06,475 : INFO : loading Doc2Vec object from NETL/model_run/pre_trained_models/doc2vec_en/docvecmodel.d2v
2018-11-17 00:45:16,819 : INFO : loading docvecs recursively from NETL/model_run/pre_trained_models/doc2vec_en/docvecmodel.d2v.docvecs.* with mmap=None
2018-11-17 00:45:16,820 : INFO 

In [24]:
def get_labels(topic, nb_labels, d2v, w2v, w2v_indexed, d_indices, w_indices):
    """
    Rank label candidates for one topic by doc2vec, word2vec and combined score.

    For every topic word, the similarity to all candidate labels is computed in both
    models and averaged over the number of topic words. The top 100 labels of each
    model are merged, re-scored in both models and ranked three ways.

    :param topic: DataFrame whose first row holds the topic words (only row 0 is used).
    :param nb_labels: number of labels to keep per ranking.
    :param d2v: doc2vec model with normalized vectors.
    :param w2v: word2vec model with normalized vectors.
    :param w2v_indexed: normalized word2vec vectors of the label candidates; row i
        is expected to belong to vocabulary index ``w_indices[i]``.
    :param d_indices: list of doc2vec doctag offsets of the label candidates.
        NOTE(review): positions in ``d2v.docvecs.doctag_syn0norm`` are mapped back
        through this list, so that matrix is presumably already reduced to
        ``d_indices`` (as ``index_embeddings`` does) - confirm.
    :param w_indices: list of word2vec vocabulary indices of the label candidates.
    :return: DataFrame with rows ``doc2vec``, ``word2vec`` and ``combined`` and
        columns ``label0`` .. ``label{nb_labels - 1}`` holding label names.
    """
    valdoc2vec = 0.0
    valword2vec = 0.0
    store_indices = []
    topic = topic.iloc[0, :]
    topic_len = len(topic)
    for item in topic:
        try:
            # The word2vec value of topic word from doc2vec trained model
            tempdoc2vec = d2v.wv.syn0norm[d2v.wv.vocab[item].index]
        except KeyError:
            # Topic word unknown to the doc2vec model: it contributes nothing.
            pass
        else:
            meandoc2vec = matutils.unitvec(tempdoc2vec).astype(REAL)  # Getting the unit vector
            # The dot product of all labels in doc2vec with the unit vector of topic word
            distsdoc2vec = dot(d2v.docvecs.doctag_syn0norm, meandoc2vec)
            valdoc2vec = valdoc2vec + distsdoc2vec

        try:
            # The word2vec value of topic word from word2vec trained model
            word_idx = w2v.wv.vocab[item].index
            tempword2vec = w2v.wv.syn0norm[word_idx]
        except KeyError:
            # Topic word unknown to the word2vec model: it contributes nothing.
            pass
        else:
            # Unit vector
            meanword2vec = matutils.unitvec(tempword2vec).astype(REAL)
            # dot product of all possible labels in word2vec vocab with the unit vector of topic word
            distsword2vec = dot(w2v_indexed, meanword2vec)
            # If the topic word is itself a label candidate, its similarity with itself
            # must not count: set it to zero here and, further down, average that label
            # over one topic word less. A single list scan finds the position.
            try:
                i_val = w_indices.index(word_idx)
            except ValueError:
                pass
            else:
                store_indices.append(i_val)
                distsword2vec[i_val] = 0.0
            valword2vec = valword2vec + distsword2vec

    avgdoc2vec = valdoc2vec / float(topic_len)  # Give the average vector over all topic words
    avgword2vec = valword2vec / float(topic_len)  # Average of word2vec vector over all topic words

    # argsort and get top 100 doc2vec label indices
    bestdoc2vec = matutils.argsort(avgdoc2vec, topn=100, reverse=True)
    resultdoc2vec = []
    # Get the doc2vec labels from indices
    for elem in bestdoc2vec:
        ind = d_indices[elem]
        temp = d2v.docvecs.index_to_doctag(ind)
        resultdoc2vec.append((temp, float(avgdoc2vec[elem])))

    # This modifies the average word2vec vector for cases
    # in which the word2vec label was same as topic word.
    for element in store_indices:
        avgword2vec[element] = (avgword2vec[element] * topic_len) / (float(topic_len - 1))

    # argsort and get top 100 word2vec label indices
    bestword2vec = matutils.argsort(avgword2vec, topn=100, reverse=True)
    # Get the word2vec labels from indices
    resultword2vec = []
    for element in bestword2vec:
        ind = w_indices[element]
        temp = w2v.wv.index2word[ind]
        resultword2vec.append((temp, float(avgword2vec[element])))

    # Get the combined set of both doc2vec labels and word2vec labels
    comb_labels = sorted(set([i[0] for i in resultdoc2vec] + [i[0] for i in resultword2vec]))
    newlist_doc2vec = []
    newlist_word2vec = []

    # Get indices from combined labels
    for elem in comb_labels:
        try:
            newlist_doc2vec.append(d_indices.index(d2v.docvecs.doctags[elem].offset))
            temp = get_word(elem)
            newlist_word2vec.append(w_indices.index(w2v.wv.vocab[temp].index))
        except (KeyError, ValueError):
            # KeyError: label unknown to a model; ValueError: known, but not among the
            # candidate indices. The statement order is kept on purpose: a label
            # missing in doc2vec is skipped for both models.
            pass
    newlist_doc2vec = set(newlist_doc2vec)
    newlist_word2vec = set(newlist_word2vec)

    # Finally again get the labels from indices. We searched for the score from both d2v and w2v models
    scores_d2v = pd.Series({
        d2v.docvecs.index_to_doctag(d_indices[elem]): float(avgdoc2vec[elem]) for elem in newlist_doc2vec
    }, name='doc2vec')

    scores_w2v = pd.Series({
        w2v.wv.index2word[w_indices[elem]]: float(avgword2vec[elem]) for elem in newlist_word2vec
    }, name='word2vec')

    # Finally get the combined score with the label. The label used will be of doc2vec not of word2vec.
    scores = pd.concat([scores_d2v, scores_w2v], axis=1)
    # skipna=False: a label gets a combined score only if it has a score in both models.
    scores['combined'] = scores[['doc2vec', 'word2vec']].sum(axis=1, skipna=False)
    scores_d2v = scores.doc2vec.sort_values(ascending=False).index.values[:nb_labels]
    scores_w2v = scores.word2vec.sort_values(ascending=False).index.values[:nb_labels]
    scores_com = scores.combined.dropna().sort_values(ascending=False).index.values[:nb_labels]
    # NOTE(review): this assumes each ranking yields at least nb_labels labels; shorter
    # rankings would not match the column list - confirm.
    scores = pd.DataFrame(
        [scores_d2v, scores_w2v, scores_com],
        index=['doc2vec', 'word2vec', 'combined'],
        columns=[f'label{i}' for i in range(nb_labels)]
    )
    return scores

In [25]:
%%time
# Group by the row index so that get_labels receives each topic as its own group
# (it reads the topic words from row 0 of the group it is given).
# progress_apply is the tqdm variant of apply registered by tqdm.pandas().
labels = topics[:].groupby(level=0).progress_apply(
    lambda row: get_labels(
        topic=row, nb_labels=nb_labels,
        d2v=d2v, w2v=w2v, w2v_indexed=w2v_indexed,
        d_indices=d2v_indices, w_indices=w2v_indices
    )
)

  # Remove the CWD from sys.path while we load stuff.
  if np.issubdtype(vec.dtype, np.int):
  app.launch_new_instance()
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


100%|██████████| 228/228 [1:10:31<00:00, 20.58s/it]

CPU times: user 1h 37min 44s, sys: 18 s, total: 1h 38min 2s
Wall time: 1h 11min 3s





In [26]:
# Display the resulting label table: three rows (doc2vec, word2vec, combined) per topic.
labels

Unnamed: 0,Unnamed: 1,label0,label1,label2,label3,label4,label5,label6,label7,label8,label9,label10,label11,label12,label13,label14,label15,label16,label17,label18
0,doc2vec,microsoft_visual_studio,desktop_virtualization,microsoft_exchange_server,cloud_computing,windows_server_2008,xdb_enterprise_server,windows_home_server,microsoft_access,virtuawin,windows_server_2003,cross-platform,centrify,vmware_service_manager,operating_system,vmware_esx,hyper-v,amazingports,ibm_db2,xen
0,word2vec,software,operating_system,virtualization,middleware,cloud_computing,linux,windows_server,microsoft_sql_server,hyper-v,sharepoint,application_software,web_application,application_server,user_interface,vmware,application_programming_interface,active_directory,virtual_machine,scalability
0,combined,cloud_computing,operating_system,hyper-v,windows_server_2003,sharepoint,application_server,windows_server_2008,vmware,virtualization,microsoft_sql_server,windows_2000,microsoft_exchange_server,hypervisor,web_application,sun_microsystems,desktop_virtualization,netware,oracle_database,postgresql
1,doc2vec,"massachusetts_gubernatorial_election,_1994","washington_gubernatorial_election,_2000",conservative_democrat,"washington_gubernatorial_election,_1984",joe_lieberman,"texas_senate,_district_15","washington_gubernatorial_election,_1980",democratic_party_(united_states),"missouri_republican_primary,_2004","washington_gubernatorial_election,_1988","texas_gubernatorial_election,_1994",ted_cruz,republican_party_presidential_primaries,presidency_of_barack_obama,"illinois_gubernatorial_election,_1834","washington_gubernatorial_election,_1992",republican_party_(united_states),democratic_leadership_council,"texas_senate,_district_16"
1,word2vec,barack_obama,presidential_nominee,john_mccain,john_kerry,hillary_clinton,george_w._bush,campaign_manager,joe_biden,mitt_romney,bill_clinton,republican_party,candidacy,bob_dole,running_mate,joe_lieberman,walter_mondale,george_h._w._bush,democratic_national_committee,state_senator
1,combined,john_mccain,joe_lieberman,joe_biden,barack_obama,bill_clinton,john_kerry,george_w._bush,hillary_clinton,presidential_nominee,mitt_romney,walter_mondale,conservative_democrat,bob_dole,george_h._w._bush,ted_cruz,howard_dean,al_gore,dennis_kucinich,lindsey_graham
2,doc2vec,windows_mobile,windows_home_server,virtuawin,microsoft_office_live,bitspirit,windows_xp_editions,pdf_studio,microsoft_windows,cross-platform,kodi_(software),windows_10,microsoft_visual_studio,synovel_collabsuite,operating_system,pgpdisk,windows_8.1,batchsync,easy2sync_for_files,red5_(media_server)
2,word2vec,operating_system,user_interface,linux,web_browser,windows_xp,windows_vista,software,microsoft_windows,windows_2000,windows_7,graphical_user_interface,application_software,application_programming_interface,firmware,windows_8,source_code,web_application,client-side,internet_explorer
2,combined,operating_system,microsoft_windows,linux,windows_2000,windows_xp,windows_vista,windows_7,windows_8,microsoft_office,windows_10,web_browser,windows_mobile,windows_server_2003,internet_explorer,application_programming_interface,windows_server_2008,windows_8.1,microsoft_visual_studio,os_x
3,doc2vec,flash_memory,mac_mini,solid-state_drive,asus_eee_pc,usb_flash_drive,compactflash,commodore_64_peripherals,usb,secure_digital,tandy_1000,apple_ii_processor_cards,dell_xps,history_of_personal_computers,lapfit,quiet_pc,bios,sord_is-11,personal_computer,expansion_card


In [27]:
# Resolve the output path once so that the log line reports the file actually written.
# '.csv' is appended only when missing: fake_args() supplies a name without extension,
# while parse_args() already produces names ending in '.csv'.
labels_path = labels_file if labels_file.endswith('.csv') else labels_file + '.csv'
logger.info(f'Writing labels to {labels_path}')
labels.to_csv(labels_path)

2018-11-17 02:34:00,734 : INFO : Writing labels to NETL/model_run/output_candidates_topics_gen35_newimp
