-----

## Transforming Word Similarity / Association datasets

In [1]:
%matplotlib inline
import pickle
from os import listdir, makedirs
from os.path import join, isfile, exists, dirname
import gc
import re
import math

import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import seaborn as sns
%pylab inline
import numpy as np
import pandas as pd
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import TfidfModel, LdaModel
from gensim.models import Word2Vec, Doc2Vec, FastText
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
from tqdm._tqdm_notebook import tqdm_notebook

from constants import *
from utils import load, init_logging, tprint, TopicsLoader
from topic_reranking import Reranker

#from eval_lda import eval_coherence

# Notebook-wide setup: enable tqdm's pandas integration and raise the
# DataFrame display limits (30 columns, 2000 rows).
tqdm_notebook.pandas()
pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 2000)

Populating the interactive namespace from numpy and matplotlib


In [165]:
# Gurevych datasets

names = ['Word1', 'Word2', 'Gold', 'Pos1', 'Pos2']
def gur(size):
    """Read one Gurevych word-pair file into a DataFrame.

    The file `wortpaare{size}.gold.pos.txt` below DATA_BASE/gurevych_datasets
    is parsed as ':'-separated text without a header row; '#' starts a
    comment. Columns are labelled with the module-level `names` list.

    :param size: number embedded in the file name (e.g. 350, 222, 65)
    :return: DataFrame with a two-level index: the original row number and
             a 'Dataset' level holding the label `gur{size}`
    """
    path = join(DATA_BASE, 'gurevych_datasets', f'wortpaare{size}.gold.pos.txt')
    pairs = pd.read_csv(path, sep=':', names=names, comment='#')
    # tag every row with its source so the sets stay distinguishable after concatenation
    pairs['Dataset'] = f'gur{size}'
    return pairs.set_index('Dataset', append=True)

# Stack the three Gurevych files (350, 222, 65) into one frame, write it to
# ETL_PATH and display the result.
df = pd.concat([gur(size) for size in (350, 222, 65)])
df.to_csv(join(ETL_PATH, 'gurevych_datasets.csv'))
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Word1,Word2,Gold,Pos1,Pos2
Unnamed: 0_level_1,Dataset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,gur350,Absage,ablehnen,3.5,n,v
1,gur350,Absage,Stellenanzeige,1.88,n,n
2,gur350,Affe,Gepäckkontrolle,0.13,n,n
3,gur350,Affe,Makake,4.0,n,n
4,gur350,Afrika,historisch,1.0,n,a
5,gur350,Agentur,Irrtum,0.0,n,n
6,gur350,Airbag,Kopfairbag,3.88,n,n
7,gur350,analysieren,Analyse,3.88,v,n
8,gur350,Ansehen,Schaden,0.88,n,n
9,gur350,Arbeitssuchender,Bewerbung,2.75,n,n


In [14]:
# SimLex999 German Dataset
# The German file provides the rating columns '1'..'13'; the POS column is
# copied over from the English SimLex-999 file.

file_en = join(DATA_BASE, 'SimLex-999', 'SimLex-999.txt')
file_de = join(DATA_BASE, 'SimLex_ALL_Langs_TXT_Format', 'MSimLex999_German.txt')

df_en = pd.read_csv(file_en, sep='\t').assign(dataset='SimLex999_en')
df = (
    pd.read_csv(file_de, sep=',')
    .assign(dataset='SimLex999_de')
    # pandas aligns on the row index here; presumably both files list the
    # pairs in the same order -- TODO confirm
    .assign(POS=df_en.POS)
    # row-wise mean / standard deviation over the rating columns '1'..'13'
    .assign(avg=lambda frame: frame.loc[:, '1':'13'].mean(axis=1))
    .assign(sd=lambda frame: frame.loc[:, '1':'13'].std(axis=1))
    # turn the running row number into an explicit 'topic_idx' index level
    .reset_index()
    .rename(columns={'index': 'topic_idx'})
    .set_index(['dataset', 'topic_idx'])
)

# NOTE: the export below is disabled, but a later cell reads 'simlex999.csv'
# from ETL_PATH, so that file has to exist already.
#df.to_csv(join(ETL_PATH, 'simlex999.csv'))
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Word1,Word2,1,2,3,4,5,6,7,8,9,10,11,12,13,Average Score,POS,avg,sd
dataset,topic_idx,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
SimLex999_de,0,alt,neu,0,0,0,0,0,0,0,1,0,0,0,0,0,0.08,A,0.077,0.277
SimLex999_de,1,klug,intelligent,10,9,9,10,10,9,10,10,10,10,10,9,10,9.69,A,9.692,0.48
SimLex999_de,2,schwer,schwierig,9,9,10,10,6,8,10,8,10,10,10,10,10,9.23,A,9.231,1.235
SimLex999_de,3,glücklich,fröhlich,10,8,10,10,9,8,10,9,10,9,10,10,10,9.46,A,9.462,0.776
SimLex999_de,4,schwer,leicht,0,0,1,0,0,0,0,1,0,0,0,0,0,0.15,A,0.154,0.376
SimLex999_de,5,schnell,rasant,10,9,9,10,10,8,10,9,9,7,10,9,10,9.23,A,9.231,0.927
SimLex999_de,6,glücklich,freudig,10,8,10,10,8,6,10,9,10,8,10,10,10,9.15,A,9.154,1.281
SimLex999_de,7,kurz,lang,0,0,1,0,0,0,0,1,0,0,0,1,0,0.23,A,0.231,0.439
SimLex999_de,8,dumm,blöd,10,9,9,9,10,9,10,9,8,10,10,10,10,9.46,A,9.462,0.66
SimLex999_de,9,sonderbar,eigentümlich,8,9,9,9,10,7,10,8,5,7,10,6,10,8.31,A,8.308,1.653


In [41]:
# WS353 German Dataset
# Loads the full German WS353 ratings and tags every pair with the subset
# file(s) -- similarity ("sim") and/or relatedness ("rel") -- it appears in.

file = join(DATA_BASE, 'WS353_All_Langs_TXT_Format', 'MWS353_German.txt')
f_sim = join(DATA_BASE, 'WS353_ALL_Langs_SIM_TXT_Format', 'WS353-german-sim.txt')
f_rel = join(DATA_BASE, 'WS353_ALL_Langs_REL_TXT_Format', 'WS353-german-rel.txt')

df = pd.read_csv(file, sep=',').assign(dataset='WS353_de')
df_sim = pd.read_csv(f_sim, sep=',').assign(dataset='WS353_sim')
df_rel = pd.read_csv(f_rel, sep=',').assign(dataset='WS353_rel')

# Rows are matched on (Word1, Word2, score). The subset files call the score
# column 'Average_Score', the full file 'Average Score'. The subset's
# 'dataset' column arrives as 'dataset_sim' / 'dataset_rel' via rsuffix.
df = df.join(df_sim.set_index(['Word1', 'Word2', 'Average_Score']), on=['Word1', 'Word2', 'Average Score'], how='outer', rsuffix='_sim')
df = df.join(df_rel.set_index(['Word1', 'Word2', 'Average_Score']), on=['Word1', 'Word2', 'Average Score'], how='outer', rsuffix='_rel')
# Pairs absent from a subset carry NaN in its tag column; show them as ''.
# fillna replaces the element-wise applymap/isinstance(float) NaN test.
df[['dataset_sim', 'dataset_rel']] = df[['dataset_sim', 'dataset_rel']].fillna('')

# row-wise mean / standard deviation over the rating columns '1'..'13'
df['avg'] = df.loc[:, '1':'13'].mean(axis=1)
df['sd'] = df.loc[:, '1':'13'].std(axis=1)
# turn the running row number into an explicit 'topic_idx' index level
df = df.reset_index().rename(columns={'index': 'topic_idx'}).set_index(['dataset', 'topic_idx'])

#df.to_csv(join(ETL_PATH, 'ws353.csv'))
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Word1,Word2,1,2,3,4,5,6,7,8,9,10,11,12,13,Average Score,dataset_sim,dataset_rel,avg,sd
dataset,topic_idx,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
WS353_de,0,Liebe,Sex,8,9,8,9,9,8.0,9.0,9.0,7.0,10.0,7.0,10,7.0,8.46,,WS353_rel,8.462,1.05
WS353_de,1,Tiger,Katze,8,7,7,8,7,8.0,9.0,10.0,9.0,8.0,7.0,8,7.0,7.92,WS353_sim,,7.923,0.954
WS353_de,2,Tiger,Tiger,10,10,10,10,10,10.0,10.0,10.0,10.0,10.0,10.0,10,10.0,10.0,WS353_sim,,10.0,0.0
WS353_de,3,Buch,Papier,9,8,8,4,8,8.0,9.0,10.0,9.0,8.0,0.0,5,6.0,7.08,,WS353_rel,7.077,2.722
WS353_de,4,Computer,Tastatur,8,7,8,7,9,8.0,9.0,10.0,8.0,10.0,6.0,6,8.0,8.0,,WS353_rel,8.0,1.291
WS353_de,5,Computer,Internet,8,7,9,7,7,9.0,9.0,10.0,9.0,10.0,8.0,7,5.0,8.08,,WS353_rel,8.077,1.441
WS353_de,6,Flugzeug,Auto,7,0,7,0,6,7.0,2.0,9.0,8.0,5.0,5.0,0,8.0,4.92,WS353_sim,,4.923,3.303
WS353_de,7,Zug,Auto,7,0,7,0,5,7.0,7.0,9.0,8.0,8.0,5.0,2,7.0,5.54,WS353_sim,,5.538,3.017
WS353_de,8,Telefon,Kommunikation,10,8,8,6,8,7.0,9.0,10.0,9.0,10.0,8.0,8,8.0,8.38,,WS353_rel,8.385,1.193
WS353_de,9,Fernseher,Radio,8,0,8,0,8,5.0,6.0,10.0,8.0,5.0,5.0,5,7.0,5.77,WS353_sim,,5.769,3.004


In [44]:
# Read the SimLex999 CSV from ETL_PATH; its first two columns form the
# row index. Show only the word-pair columns.
file = join(ETL_PATH, 'simlex999.csv')
df = pd.read_csv(file, index_col=[0, 1], header=0)
df.loc[:, ['Word1', 'Word2']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Word1,Word2
dataset,topic_idx,Unnamed: 2_level_1,Unnamed: 3_level_1
SimLex999_de,0,alt,neu
SimLex999_de,1,klug,intelligent
SimLex999_de,2,schwer,schwierig
SimLex999_de,3,glücklich,fröhlich
SimLex999_de,4,schwer,leicht
SimLex999_de,5,schnell,rasant
SimLex999_de,6,glücklich,freudig
SimLex999_de,7,kurz,lang
SimLex999_de,8,dumm,blöd
SimLex999_de,9,sonderbar,eigentümlich


-----

### Correcting datasets / applying manual Wiki-lemmatization

In [55]:
# Load the 'dict' artifact of the 'dewiki' corpus through the project helper
# `load` (presumably a gensim Dictionary, given `token2id` -- TODO confirm),
# then look up the integer id stored for the token 'Wahlfach'.
dic = load('dewiki', 'dict')
dic.token2id['Wahlfach']

Loading dict from ../data/preprocessed/LDAmodel/noun/bow/dewiki_noun_bow.dict


91802

In [9]:
# Load the 'lemmap' artifact of the 'dewiki' corpus through the project helper
# `load` and display it. The lookup cell below treats it as a pandas Series
# of strings (boolean masks, `.str` accessor, label lookup).
lemmap = load('dewiki', 'lemmap')
lemmap

fLoading {file}


token
+10                                                                                                                     +10
+φ                                                                                                                       +φ
0+0                                                                                                                     0+0
0,0022″                                                                                                             0,0022″
0,01–0,1                                                                                                           0,01–0,1
0,1)-Form                                                                                                         0,1)-Form
0,1-Megapixel-Kamera                                                                                   0,1-Megapixel-Kamera
0,1-Schritt                                                                                                   0,1-Schritten
0,

In [49]:
# Words per evaluation dataset that still need a manual lookup in `lemmap`.
# The lists live side by side under their dataset key, so none of them is
# overwritten by a later assignment.
MISSING_WORDS = {
    # simlex
    'simlex': ['Agression', 'Blase', 'Büderlichkeit', 'Decke', 'Dunstschleier', 'Ecke', 'Frieden', 'Gefährte', 'Gegener', 'Gerassel', 'Haufen', 'Haushelferin', 'Herde', 'Kleider', 'Kohle', 'Menschen', 'Männer', 'Nachrichten', 'Reise', 'Samen', 'Schaden', 'Zähne'],
    # ws
    'ws': ['Augen', 'Drogen', 'Frieden', 'Kleider', 'Kohle', 'Medien', 'Menschen', 'Nachrichten', 'Ohren', 'Präzedensfall', 'Reise', 'Runde', 'Truppen', 'abheben', 'beispielhaft', 'besiegen', 'ficken', 'klug', 'kämpfen', 'live', 'palestinensisch', 'säugen', 'trinken', 'virtuos', 'vorangehend', 'vorausgehend', 'vorbildlich'],
    # gur
    'gur': ['Aktensperrfrist', 'Arbeitssuchender', 'Baumaschinenmeister', 'Beamte', 'Betrugshandlung', 'Böse', 'Büroequipment', 'Detailkonstrukteurinnen', 'Endlast', 'Fachbereichsvertreter', 'Flächeneinsparung', 'Frieden', 'Frontenbildung', 'Frühlingssonne', 'Garnerzeugung', 'Gastlandkontakte', 'Geschirrdurcheinander', 'Gleitkomma', 'Hirnsignal', 'Körpernorm', 'Langzeittherapieprogramm', 'Linkstatistik', 'Makake', 'Managerinnen', 'Metadatenwerkzeug', 'Migrantinnen', 'Monate', 'Neoautoritarismus', 'Neuronenaktivität', 'Organisationskompetenz', 'Personaldisposition', 'Portfolioanalyse', 'Premium-Hersteller', 'Probenehmer', 'Quartalsumfrage', 'Quelle', 'Reise', 'Reiseschutzpass', 'Restaurierungsmethode', 'Samenzahnrad', 'Sandwich-Konzept', 'Schaden', 'Soziales', 'Spitze', 'Sports-Tourer', 'Studierende', 'TV-Kamera', 'Tiefbaubauingenieur', 'Ursprugsort', 'Verkäuferinnen', 'Vertriebstechniker', 'Vliesstofferzeugung', 'Volierenzelt', 'Wahlfächer', 'Warte', 'Weiblich'],
}
# Dataset inspected by this run; change the key to look at another list.
missing = MISSING_WORDS['gur']
for word in missing:
    print(word, '->')
    # index labels of `lemmap` whose mapped value is exactly `word`
    lem = lemmap[lemmap == word].index.values
    print(lem)
    if len(lem) == 0:
        # No exact hit: list index labels whose value starts with the word
        # minus its last two characters. For words of one or two characters
        # that prefix is empty and would match every entry, so skip the search.
        prefix = word[:-2]
        if prefix:
            print(list(lemmap[lemmap.str.startswith(prefix)].index.values))
        else:
            print([])
    # direct lookup with `word` as index label; report instead of aborting the loop
    try:
        print(lemmap[word])
    except KeyError as e:
        print('error', e)

    print()

Aktensperrfrist ->
[]
[]
error 'Aktensperrfrist'

Arbeitssuchender ->
[]
['Arbeitssuchende']
error 'Arbeitssuchender'

Baumaschinenmeister ->
[]
[]
error 'Baumaschinenmeister'

Beamte ->
[]
['Beam', 'Beam-Außenlaststation', 'Beam-Familie', 'Beam-Power-Tetrode', 'Beam-Scanning', 'Beam-Suntory-Konzern', 'Beam-Verlag', 'Beaman', 'Beaman-Gletscher', 'Beamaufnahme', 'Beambten', 'Beamdog', 'Beamdogs', 'Beame', 'Beamen', 'Beamer', 'Beamer-Leinwand', 'Beamer-Präsentation', 'Beamersystem', 'Beamfleot', 'Beamformer', 'Beamforming', 'Beaming', 'Beaminster', 'Beamish', 'Beamline', 'Beamlines', 'Beamoa', 'Beamon', 'Beamons', 'Beamont', 'Beamor', 'Beamptung', 'Beams', 'Beamshow', 'Beamsville', 'BeamtStG', 'BeamtStG.', 'BeamtVG', 'Beamte/Angestellte', 'Beamten-Ausbildung', 'Beamten-Bau-', 'Beamten-Dienstrechtsgesetz', 'Beamten-Elite', 'Beamten-Erholungsheim', 'Beamten-Hierarchie', 'Beamten-Krankenversicherung', 'Beamten-Laufbahn', 'Beamten-Selbsthilfewerk', 'Beamten-Verein', 'Beamten-Versicherung', '

[]
error 'Managerinnen'

Metadatenwerkzeug ->
[]
[]
error 'Metadatenwerkzeug'

Migrantinnen ->
['Migrantin']
error 'Migrantinnen'

Monate ->
['Monat']
error 'Monate'

Neoautoritarismus ->
[]
[]
error 'Neoautoritarismus'

Neuronenaktivität ->
['Neuronenaktivität']
Neuronenaktivität

Organisationskompetenz ->
['Organisationskompetenz']
Organisationskompetenz

Personaldisposition ->
['Personaldisposition']
Personaldisposition

Portfolioanalyse ->
['Portfolioanalyse']
Portfolioanalyse

Premium-Hersteller ->
['Premium-Hersteller']
Premium-Hersteller

Probenehmer ->
['Probenehmer']
Probenehmer

Quartalsumfrage ->
[]
[]
error 'Quartalsumfrage'

Quelle ->
[]
['Quel', 'Quela', 'Quelae', 'Quelaines', 'Quelaines-Saint-Gault', 'Quelan', 'Quelapa', 'Quelcata', 'Quelccaya-Eiskappe', 'Quelch', 'Quele', 'Quele-Boro-Uai', 'Quelea', 'Quelea"-Art', 'Quelen', 'Queler', 'Quelet', 'Quelet-Reaktion', 'Quelfes', 'Quelhas', 'Quelicai', 'Quelicais', 'Quelimane', 'Quelin', 'Queling', 'Quelite', 'Quelitz', 'Quelk

[]
['Restaurierungsmethode', 'Restaurierungsmethodik']
Restaurierungsmethoden

Samenzahnrad ->
[]
[]
error 'Samenzahnrad'

Sandwich-Konzept ->
['Sandwich-Konzept']
Sandwich-Konzept

Schaden ->
[]
['Schad', "Schad'n", 'Schad)Software', 'Schad-Ros', 'Schad-Rossas', 'Schadaeus', 'Schadan', 'Schadanfälligkeit', 'Schadans', 'Schadau', 'Schadaupark', 'Schadaus', 'Schadausmaß', 'Schadbach', 'Schadberg', 'Schadbestand', 'Schadbild', 'Schadburg', 'Schadbär', 'Schadchen', 'Schadcode', 'Schadda', 'Schaddach', 'Schaddad', 'Schaddadide', 'Schaddadiden', 'Schaddai', 'Schaddaj', 'Schaddel', 'Schaddelmühle', 'Schaddingsdorf', 'Schaddisposition', 'Schaddād', 'Schade-Ahausen', 'Schade-Lindig', 'Schade-Salwey', 'Schadeberg', 'Schadeberg-Herrmann', 'Schadebeuster', 'Schadebrodt', 'Schadeburg', 'Schadeburgstraße', 'Schadeck', 'Schadee', 'Schadeffekt', 'Schadefähre', 'Schadegan', 'Schadegard', 'Schadegards', 'Schadegeld', 'Schadegg', 'Schadegur', 'Schadehop', 'Schadehorn', 'Schadeinheit', 'Schadeinwirkung',

[]
[]
error 'Sports-Tourer'

Studierende ->
[]
['Studieren', 'Studierende(r', 'Studierenden-Nationalmannschaft', 'Studierenden-Parlament', 'Studierenden-Service-Center', 'Studierenden-Sozialerhebung', 'Studierenden-Team', 'Studierenden-Weltmeisterschaft', 'Studierendenakademie', 'Studierendenangelegenheit', 'Studierendenanzahl', 'Studierendenapartment', 'Studierendenausschuss', 'Studierendenaustausch', 'Studierendenauswahl', 'Studierendenausweis', 'Studierendenbeitrag', 'Studierendenberatung', 'Studierendenbereich', 'Studierendenbetreuung', 'Studierendenbewegung', 'Studierendenchapters', 'Studierendendichte', 'Studierendenexekutive', 'Studierendenfachbereich', 'Studierendenfilmfestival', 'Studierendenforum', 'Studierendenförderung', 'Studierendengemein', 'Studierendengemeinde', 'Studierendengemeinschaft', 'Studierendengesellschaft', 'Studierendengruppe', 'Studierendenhaus', 'Studierendenheim', 'Studierendeninitiative', 'Studierendenjahrgang', 'Studierendenkongress', 'Studierendenmobili