In [1]:
"""
Author:         Shraey Bhatia
Date:           October 2016
File:           get_indices.py


This file takes in the output of pruned_documents.py and word2vec_phrases.py and give back the
respective indices from doc2vec model and word2vec model
respectively. You can also download these output files from URLs in readme.(Word2vec phrase List
and Filtered/short document titles). Though all these files have already been given to run the
models but script is given if you want to create your own. These indices fileswill be used in
cand-generation.py to generate label candidates
"""

# TODO: changed
#from gensim.models.deprecated.doc2vec import Doc2Vec
#from gensim.models.deprecated.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec
from gensim.models.word2vec import Word2Vec
import re
import pickle

In [2]:
# Global Parameters EN
doc2vec_model = "model_run/pre_trained_models/doc2vec_en/docvecmodel.d2v"  # Trained Doc2vec Model
word2vec_model = "model_run/pre_trained_models/word2vec_en/word2vec"  # Trained word2vec model
short_label_documents = "additional_support_files/short_label_documents_en"  # The file created by pruned_documents.py. FIltering short or long title documents.
short_label_word2vec_tokenised = "training/additional_files/word2vec_phrases_list_tokenized_en.txt"  # The file created by word2vec_phrases.py Removing brackets from filtered wiki titles.
doc2vec_indices_output = "doc2vec_indices_en"  # The output file which map pruned doc2vec labels to indcies from doc2vec model.
word2vec_indices_output = "word2vec_indices_en"  # the output file that maps short_label_word2vec_tokenised to indices from wrod2vec model.

In [2]:
# Global Parameters DE
doc2vec_model = "model_run/pre_trained_models/d2v_de_gen35/d2v"  # Trained Doc2vec Model
word2vec_model = "model_run/pre_trained_models/w2v_de_gen35/w2v"  # Trained word2vec model
short_label_documents = "additional_support_files/short_label_documents_de_lemmatized"  # The file created by pruned_documents.py. FIltering short or long title documents.
short_label_word2vec_tokenised = "training/additional_files/word2vec_phrases_list_tokenized_de.txt"  # The file created by word2vec_phrases.py Removing brackets from filtered wiki titles.
doc2vec_indices_output = "model_run/pre_trained_models/d2v_de_gen35/d2v_indices_de_gen35"  # The output file which map pruned doc2vec labels to indcies from doc2vec model.
word2vec_indices_output = "model_run/pre_trained_models/w2v_de_gen35/w2v_indices_de_gen35"  # the output file that maps short_label_word2vec_tokenised to indices from wrod2vec model.

In [24]:
from gensim.models.callbacks import CallbackAny2Vec

class EpochLogger(CallbackAny2Vec):
    """Callback to log information about training"""
    def __init__(self):
        self.epoch = 1

    def on_epoch_begin(self, model):
        print("Epoch #{:02d} start".format(self.epoch))

    def on_epoch_end(self, model):
        print("Epoch #{:02d} end".format(self.epoch))
        self.epoch += 1

class EpochSaver(CallbackAny2Vec):
    """Callback to save model after each epoch."""
    def __init__(self, model_name, directory, checkpoint_every=5):
        self.model_name = model_name
        self.directory = join(directory, 'checkpoints')
        if not exists(self.directory):
            makedirs(self.directory)
        self.epoch = 1
        self.checkpoint_every = checkpoint_every

    def on_epoch_end(self, model):
        if self.epoch % self.checkpoint_every == 0:
            file = '{}_epoch{:02d}'.format(self.model_name, self.epoch)
            filepath = join(self.directory, file)
            print('Saving checkpoint to ' + filepath)
            model.save(filepath)
        self.epoch += 1


In [26]:
# Removing any junk labels and also if a label pops up with the term disambiguation.
def get_word(word):
    inst = re.search(r"_\(([A-Za-z0-9_]+)\)", word)

    if inst is None:
        length = len(word.split("_"))
        if length < 5:
            return True, word
    else:
        if inst.group(1) != "disambiguation":
            word2 = re.sub(r'_\(.+\)', '', word)
            if len(word2.split(" ")) < 5:
                return True, word

    return False, word


# Load the trained doc2vec and word2vec models.
d2v = Doc2Vec.load(doc2vec_model)
w2v = Word2Vec.load(word2vec_model)
print("Models loaded")

Models loaded


In [7]:
# Loading the pruned tiles and making a set of it
with open(short_label_documents, "rb") as k:
    doc_labels = pickle.load(k, encoding='latin1')
doc_labels = set(doc_labels)
print("Pruned document titles loaded")

Pruned document titles loaded


In [10]:
with open(short_label_word2vec_tokenised, 'a') as fp:
    for term in doc_labels:
        fp.write(term + '\n')


In [9]:
sorted(doc_labels)

['A',
 'A&E_Design',
 'A&E_Germany',
 'A&E_Network',
 'A&M_Record',
 'A&O_Hotel_and_Hostel',
 'A&P_Group',
 'A&W',
 'A&W_Architektur_&_Wohnen',
 "A'-anepada",
 "A'ala_Hubail",
 'A*-Algorithmus',
 'A*M*E',
 'A*P*E',
 'A*Teens',
 'A+',
 'A+E_Network',
 'A-0',
 'A-101',
 'A-102',
 'A-103',
 'A-104',
 'A-105',
 'A-135_Abm-System',
 'A-1_Pictur',
 'A-222',
 'A-38',
 'A-68',
 'A-90_Orljonok',
 'A-A-P',
 'A-Band',
 'A-Blogger',
 'A-Boot',
 'A-DNA',
 'A-Division',
 'A-Division-Liga',
 'A-Dur',
 'A-Endopsychosin',
 'A-Fonds-Perdu-Beitrag',
 'A-Frame',
 'A-Gruppe',
 'A-Hmao',
 'A-Jugend-Bundesliga',
 'A-Jugend_Handball-Bundesliga_2011/12',
 'A-Jugend_Handball-Bundesliga_2012/13',
 'A-Jugend_Handball-Bundesliga_2013/14',
 'A-Jugend_Handball-Bundesliga_2014/15',
 'A-Jugend_Handball-Bundesliga_2015/16',
 'A-Jugend_Handball-Bundesliga_2016/17',
 'A-Junioren-Bundesliga',
 'A-Junioren-Bundesliga_2003/04',
 'A-Junioren-Bundesliga_2004/05',
 'A-Junioren-Bundesliga_2005/06',
 'A-Junioren-Bundesliga_2006/

In [11]:
# laoding thw phrasses used in training word2vec model. And then replacing space with underscore.
h = open(short_label_word2vec_tokenised, 'r')
list_labels = []
for line in h:
    line = line.strip()
    list_labels.append(line)
list_labels = set(list_labels)

In [12]:
sorted(list_labels)

['A',
 'A&E_Design',
 'A&E_Germany',
 'A&E_Network',
 'A&M_Record',
 'A&O_Hotel_and_Hostel',
 'A&P_Group',
 'A&W',
 'A&W_Architektur_&_Wohnen',
 "A'-anepada",
 "A'ala_Hubail",
 'A*-Algorithmus',
 'A*M*E',
 'A*P*E',
 'A*Teens',
 'A+',
 'A+E_Network',
 'A-0',
 'A-101',
 'A-102',
 'A-103',
 'A-104',
 'A-105',
 'A-135_Abm-System',
 'A-1_Pictur',
 'A-222',
 'A-38',
 'A-68',
 'A-90_Orljonok',
 'A-A-P',
 'A-Band',
 'A-Blogger',
 'A-Boot',
 'A-DNA',
 'A-Division',
 'A-Division-Liga',
 'A-Dur',
 'A-Endopsychosin',
 'A-Fonds-Perdu-Beitrag',
 'A-Frame',
 'A-Gruppe',
 'A-Hmao',
 'A-Jugend-Bundesliga',
 'A-Jugend_Handball-Bundesliga_2011/12',
 'A-Jugend_Handball-Bundesliga_2012/13',
 'A-Jugend_Handball-Bundesliga_2013/14',
 'A-Jugend_Handball-Bundesliga_2014/15',
 'A-Jugend_Handball-Bundesliga_2015/16',
 'A-Jugend_Handball-Bundesliga_2016/17',
 'A-Junioren-Bundesliga',
 'A-Junioren-Bundesliga_2003/04',
 'A-Junioren-Bundesliga_2004/05',
 'A-Junioren-Bundesliga_2005/06',
 'A-Junioren-Bundesliga_2006/

In [13]:
word2vec_labels = []
for words in list_labels:
    new = words.split(" ")
    temp = '_'.join(new)
    word2vec_labels.append(temp)
word2vec_labels = set(word2vec_labels)
print("Word2vec model phrases loaded")
sorted(word2vec_labels)

Word2vec model phrases loaded


['A',
 'A&E_Design',
 'A&E_Germany',
 'A&E_Network',
 'A&M_Record',
 'A&O_Hotel_and_Hostel',
 'A&P_Group',
 'A&W',
 'A&W_Architektur_&_Wohnen',
 "A'-anepada",
 "A'ala_Hubail",
 'A*-Algorithmus',
 'A*M*E',
 'A*P*E',
 'A*Teens',
 'A+',
 'A+E_Network',
 'A-0',
 'A-101',
 'A-102',
 'A-103',
 'A-104',
 'A-105',
 'A-135_Abm-System',
 'A-1_Pictur',
 'A-222',
 'A-38',
 'A-68',
 'A-90_Orljonok',
 'A-A-P',
 'A-Band',
 'A-Blogger',
 'A-Boot',
 'A-DNA',
 'A-Division',
 'A-Division-Liga',
 'A-Dur',
 'A-Endopsychosin',
 'A-Fonds-Perdu-Beitrag',
 'A-Frame',
 'A-Gruppe',
 'A-Hmao',
 'A-Jugend-Bundesliga',
 'A-Jugend_Handball-Bundesliga_2011/12',
 'A-Jugend_Handball-Bundesliga_2012/13',
 'A-Jugend_Handball-Bundesliga_2013/14',
 'A-Jugend_Handball-Bundesliga_2014/15',
 'A-Jugend_Handball-Bundesliga_2015/16',
 'A-Jugend_Handball-Bundesliga_2016/17',
 'A-Junioren-Bundesliga',
 'A-Junioren-Bundesliga_2003/04',
 'A-Junioren-Bundesliga_2004/05',
 'A-Junioren-Bundesliga_2005/06',
 'A-Junioren-Bundesliga_2006/

In [14]:
doc_indices = []
word_indices = []

# finds the coresponding index of the title from doc2vec model
for elem in doc_labels:
    status, item = get_word(elem)
    if status:
        try:
            val = d2v.docvecs.doctags[elem].offset
            doc_indices.append(val)
        except:
            pass

# Finds the corseponding index from word2vec model
for elem in word2vec_labels:
    try:
        val = w2v.wv.vocab[elem].index
        word_indices.append(val)
    except:
        pass

In [15]:
doc_indices

[683257,
 241107,
 1916926,
 50029,
 3118151,
 441360,
 1357706,
 2675749,
 3069313,
 504944,
 3516376,
 786159,
 2903137,
 2571959,
 600371,
 3705857,
 503809,
 701579,
 223306,
 2615443,
 3423201,
 3181976,
 1659299,
 1751917,
 2462255,
 2639916,
 841480,
 1912416,
 1718808,
 3279756,
 1984014,
 3141688,
 2114307,
 1324656,
 1307202,
 511500,
 1729098,
 1907007,
 2823737,
 2328722,
 3362957,
 847387,
 1832483,
 3854218,
 2172498,
 1066489,
 3158532,
 1599962,
 1678312,
 1927240,
 1039594,
 1733283,
 90328,
 388682,
 3862979,
 2186519,
 1981954,
 477718,
 353446,
 213921,
 499040,
 3448977,
 614820,
 447204,
 2473684,
 303990,
 1163100,
 2151938,
 3279473,
 1303799,
 1797451,
 566655,
 3781485,
 1922305,
 3115481,
 358617,
 764799,
 2171481,
 239383,
 1885046,
 3190944,
 3708653,
 2379701,
 2256677,
 274,
 1505061,
 1590446,
 3433708,
 236393,
 299420,
 449593,
 3657073,
 3252833,
 1863480,
 3031843,
 402481,
 258086,
 3669762,
 1175766,
 2957848,
 2901362,
 1101228,
 647931,
 1615610

In [17]:
word_indices

[338927,
 80206,
 578675,
 322006,
 331667,
 476271,
 271715,
 56176,
 392071,
 683026,
 537286,
 112835,
 508831,
 20130,
 82798,
 426010,
 419514,
 284872,
 149241,
 668774,
 92967,
 96587,
 702667,
 597670,
 310767,
 701671,
 579988,
 615855,
 637794,
 161968,
 208912,
 170997,
 13957,
 168624,
 59654,
 594860,
 222207,
 293297,
 482581,
 148268,
 536689,
 417610,
 561126,
 7653,
 620711,
 234112,
 183976,
 43100,
 688711,
 30410,
 31610,
 68583,
 194988,
 327868,
 701454,
 87364,
 401465,
 221902,
 253176,
 479187,
 541809,
 338336,
 125390,
 145894,
 242811,
 695276,
 121599,
 598292,
 289722,
 383887,
 588622,
 592672,
 52316,
 588414,
 382649,
 189406,
 682531,
 207750,
 386295,
 274196,
 181246,
 611149,
 681862,
 67415,
 528433,
 111682,
 74578,
 478802,
 226555,
 15773,
 515840,
 589530,
 629687,
 428780,
 600759,
 653534,
 76463,
 13379,
 666182,
 257213,
 416247,
 669220,
 36041,
 95684,
 320257,
 60779,
 389522,
 139524,
 86680,
 197889,
 223431,
 124322,
 403485,
 717853,

In [18]:
# creating output indices file
with open(doc2vec_indices_output, 'wb') as m:
    pickle.dump(doc_indices, m)
with open(word2vec_indices_output, 'wb') as n:
    pickle.dump(word_indices, n)

In [3]:
with open(doc2vec_indices_output, 'rb') as m:
    doc_indices = pickle.load(m)
with open(word2vec_indices_output, 'rb') as n:
    word_indices = pickle.load(n)

In [8]:
with open('../d2v_indices', 'rb') as m:
    doc_indices2 = pickle.load(m)
with open('../w2v_indices', 'rb') as n:
    word_indices2 = pickle.load(n)

In [15]:
len(doc_indices), sorted(doc_indices)[:10]

(1525493, [1, 3, 5, 8, 10, 12, 14, 17, 20, 26])

In [17]:
len(doc_indices2), sorted(doc_indices2)[:10]

(1524230, [1, 3, 5, 8, 10, 12, 14, 17, 20, 26])

In [16]:
len(word_indices), sorted(word_indices)[:10]

(307478, [1, 2, 3, 4, 5, 6, 9, 11, 14, 16])

In [18]:
len(word_indices2), sorted(word_indices2)[:10]

(307344, [1, 2, 3, 4, 5, 6, 9, 11, 14, 16])

In [13]:
import numpy as np
d1 = np.asarray(doc_indices)
d2 = np.asarray(doc_indices2)
w1 = np.asarray(word_indices)
w2 = np.asarray(word_indices2)

In [36]:
# all arrays are unique
d2v_diff = np.setdiff1d(d1, d2)
w2v_diff = np.setdiff1d(w1, w2)

In [44]:
diff_phrases = []
for offset in d2v_diff:
    diff_phrase = d2v.docvecs.offset2doctag[offset]
    diff_phrases.append(diff_phrase)
    print(diff_phrase)

DB
Knospung
Bezirk_Uster
Suracon
Linear-Antiqua
ET
Lokal_konstant_Funktion
Meg
Parainfektiöse_Erkrankung
Bezirk_Güssing
Nazca-Ebene
EDC
Trento
Sextius_Naso
UTC−2
Zeitverschiebung
Bindorf
UTC+13:45
Gebäude_mittel_Höhe
Flache
Unicodeblock_Javanisch
Friedrich_Magnu_zu_Solms-Wildenfels
Reinhold_Batberger
Josefa
Brudslöjan
Psalterion
Titularbistum_Algiza
Titularbistum_Eriza
liste_Norder_Persönlichkeit
Arteria_thoracoacromialis
Numisnautik
Titularbistum_Dragonara
NGC_469
NGC_479
Titularbistum_Aufinium
Ingrid_Kötter
Liste_bekannt_Illustrator
Hoare
Mnemonic
Liste_altkirchenslawisch_Schriftsteller
Bistum_Baton_Rouge
Ski_Nordisch
Sci
Zeuxo
Iolkos
North_Star
Bistum_Sées
Ramessidenzeit
M15
weiß_Stil
Universität_Ermland-Masur
Flurbereinigungsgesetz
NGC_1866
Kwch-Sendemast
Titularbistum_Pesto
Titularbistum_Minturnae
Speiseeishersteller
Rajon_Mszislau
Kanton_Saint-Paul-2
Titularbistum_Aeca
Ye_sacred_mus
Taishu
Autoestrada_A6
NGC_512
Anželika_Ahmetšina
franzen_Karl
Wienberg
Titularbistum_Aquaviva
Nati

In [38]:
for idx in w2v_diff:
    print(w2v.wv.index2word[idx])

ET
Kongresswahl
DB
GR
VfB
Absage
April_1918
Neil
MSc
Feuerkraft
Schimanski
Rostrum
Kurienkardinal
Farina
Tanaka
Breton
Marjorie
Meg
Nordmann
Winfried
Josefa
Felice
Bittner
Lateiner
Joseph_Anton
GRU
Roß
franzen_Karl
Shaun
Lowry
Nah
Alang
Pal
North_Star
PG
Invarianz
Trento
Flache
Mehdi
Nadler
Knospung
ADH
Hoare
Kabila
Drury
Dutton
Cate
Zeitverschiebung
EDP
Böck
Craddock
Basisjahr
Bassi
Sci
Kenney
Bezirk_Güssing
EDC
Junqueira
Ramessidenzeit
Iolkos
Spiel_der_Frankophonie
DAM
Nadolny
BNA
Dünkel
Johann_Friedrich_von_Sachs
Surp
Rupfen
Wahlmännerkollegium
Cerf
Kekulé_von_Stradonitz
Prata
Doon
Icke
Higashi
Deanna
Beeston
Strathcona
Wilt
DAR
Barris
Kasuga
Lucanus
PAZ
Homem
Ski_Nordisch
M15
Arzu
Route_25
Revolutionsgericht
Eyvind
Ortsbach
Bezirk_Uster
Landvoigt
golden_Biene
groß_Preis_von_Katalonien
Antiques
Maxence
Henri_de_la_Tour
Basu
Linear-Antiqua
Aignan
Tickhill
Rasi
Mirambeau
Max_Baumann
Vernouillet
Glasgow_Cup
Pupin
Uselding
Galego
Route_41
Tegmen
Kedar
Flurbereinigungsgesetz
Wärmekeimer


In [43]:
import pandas as pd
from os.path import join
from constants import ETL_PATH
dewiki_phrases_lemmatized = 'dewiki_phrases_lemmatized.pickle'
phrases = pd.read_pickle(join('..', ETL_PATH, dewiki_phrases_lemmatized))
phrases

Unnamed: 0_level_0,sent_idx,text,token,doc_len,title_len
hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8952056961092092653,1,Alan_Smithee,Alan_Smithee,747,2
598046625986755870,44,Actinium,Actinium,866,1
8442369265370766621,98,Ang_Lee,ANG_Lee,1632,2
-5325279570187525080,181,Anschluss,Anschluss,413,1
5107548614255273253,199,Aussagenlogik,Aussagenlogik,7211,1
-6810310479569543740,599,Anthony_Minghella,Anthony_Minghella,637,2
-291419119128528545,636,US-amerikanischer_Film,us-amerikanisch_Film,2992,2
-686601136003585762,767,Vorsätze_für_Maßeinheiten,Vorsatz_für_Maßeinheit,992,3
-928617659304474122,829,Abkürzungen/Gesetze_und_Recht,Abkürzungen/Gesetz_und_Recht,167,3
-4698193686953049209,842,Liste_von_Unternehmen_mit_Namensherkunftserklä...,Liste_von_Unternehmen_mit_Namensherkunftserklä...,42,5


In [45]:
phrases[phrases.token.isin(diff_phrases)]

Unnamed: 0_level_0,sent_idx,text,token,doc_len,title_len
hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7446122115458236883,91691,DB,DB,40,1
4965651893358388020,658394,Knospung,Knospung,40,1
4696879948454559846,1280066,Bezirk_Uster,Bezirk_Uster,40,2
-4216224223148265736,1583290,Suracon,Suracon,40,1
-9048151810841266133,1701937,Linear-Antiqua,Linear-Antiqua,40,1
4190235603074552992,1840026,ET,ET,40,1
5633238256754645611,2024959,Lokal_konstante_Funktion,Lokal_konstant_Funktion,40,3
8611299304512053717,2218783,Meg,Meg,40,1
-4401550258778133427,2312981,Parainfektiöse_Erkrankung,Parainfektiöse_Erkrankung,40,2
-8041491462077928033,2690376,Bezirk_Güssing,Bezirk_Güssing,40,2
