In [1]:
from os.path import join

import numpy as np
import pandas as pd
from pygermanet import load_germanet, Synset
from tqdm._tqdm_notebook import tqdm_notebook

from constants import LDA_PATH
from utils import load

np.set_printoptions(precision=3)
gn = load_germanet()
tqdm_notebook.pandas()

In [2]:
def orth(synset):
    return synset.lemmas[0].orthForm

def compare_synset_lists(synset_list1, synset_list2, sim_func, agg_func):
    try:
        return agg_func(sim_func(ss1, ss2) for ss1 in synset_list1 for ss2 in synset_list2)
    except ValueError:
        return np.nan

def similarities(topic, ignore_unknown=True, sim_func=Synset.sim_lch, agg_func=max, topn=10):
    arr = np.zeros((topn, topn))
    for j, ssl1 in enumerate(topic.values):
        for k, ssl2 in enumerate(topic.values[j+1:], j+1):
            arr[j, k] = compare_synset_lists(ssl1, ssl2, sim_func, agg_func)
    arr = np.add(arr, arr.T)
    if ignore_unknown:
        arr[arr == 0] = np.nan
    return np.nanmean(arr)

In [4]:
dataset = 'O'
version, corpus_type, params, nbtopics, topn = 'noun', 'bow', 'e42', 10, 10

topics = load('topics', dataset, params, nbtopics)
sstopics = topics.applymap(gn.synsets)

topics['lch'] = sstopics.progress_apply(similarities, axis=1, sim_func=Synset.sim_lch, agg_func=max, topn=topn)
topics['lch_ignr_unkwn'] = sstopics.progress_apply(similarities, axis=1, sim_func=Synset.sim_lch, agg_func=max, topn=topn, ignore_unknown=False)
topics['res'] = sstopics.progress_apply(similarities, axis=1, sim_func=Synset.sim_res, agg_func=max, topn=topn)
topics['res_ignr_unkwn'] = sstopics.progress_apply(similarities, axis=1, sim_func=Synset.sim_res, agg_func=max, topn=topn, ignore_unknown=False)
topics['jcn'] = sstopics.progress_apply(similarities, axis=1, sim_func=Synset.dist_jcn, agg_func=min, topn=topn)
topics['jcn_ignr_unkwn'] = sstopics.progress_apply(similarities, axis=1, sim_func=Synset.dist_jcn, agg_func=min, topn=topn, ignore_unknown=False)
topics['lin'] = sstopics.progress_apply(similarities, axis=1, sim_func=Synset.sim_lin, agg_func=max, topn=topn)
topics['lin_ignr_unkwn'] = sstopics.progress_apply(similarities, axis=1, sim_func=Synset.sim_lin, agg_func=max, topn=topn, ignore_unknown=False)

topics = topics.iloc[:, topn:]
file = join(LDA_PATH, version, corpus_type, 'topics', f'{dataset}_{version}_{corpus_type}_topic-scores_germanet.csv')
print(f'Writing {file}')
#topics.to_csv(file)
topics

Reading ../data/preprocessed/LDAmodel/noun/bow/topics/OnlineParticipation_topic-candidates.csv
File b'../data/preprocessed/LDAmodel/noun/bow/topics/OnlineParticipation_topic-candidates.csv' does not exist
Loading topics via TopicsLoader
Loading dictionary from ../data/preprocessed/LDAmodel/noun/bow/OnlineParticipation_noun_bow.dict
Loading model from ../data/preprocessed/LDAmodel/noun/bow/e42/OnlineParticipation_LDAmodel_e42_10_ep30


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Writing ../data/preprocessed/LDAmodel/noun/bow/topics/O_noun_bow_topic-scores_germanet.csv


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,lch,lch_ignr_unkwn,res,res_ignr_unkwn,jcn,jcn_ignr_unkwn,lin,lin_ignr_unkwn
dataset,param_id,nb_topics,topic_idx,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
OnlineParticipation,e42,10,0,1.35392,1.188808,1.341991,1.178334,11.520278,10.115366,0.200826,0.176335
OnlineParticipation,e42,10,1,1.454871,1.309384,2.810047,2.529042,11.530984,10.377885,0.331,0.2979
OnlineParticipation,e42,10,2,1.623207,1.425255,1.766787,1.551325,12.962648,11.381838,0.224263,0.196914
OnlineParticipation,e42,10,3,1.315414,1.183873,3.044576,2.740118,13.126385,11.813746,0.315573,0.284015
OnlineParticipation,e42,10,4,1.305999,1.14673,1.61844,1.421069,11.249432,9.87755,0.221145,0.194176
OnlineParticipation,e42,10,5,1.444321,1.299889,2.803389,2.52305,16.249276,14.624348,0.279213,0.251292
OnlineParticipation,e42,10,6,1.395183,1.255664,1.906301,1.715671,11.330398,10.197358,0.246626,0.221964
OnlineParticipation,e42,10,7,1.512617,1.361356,2.094268,1.884841,12.247212,11.02249,0.26778,0.241002
OnlineParticipation,e42,10,8,1.396267,1.25664,2.086485,1.877836,13.303764,11.973388,0.234291,0.210862
OnlineParticipation,e42,10,9,1.546017,1.357479,2.051062,1.800933,14.184525,12.454705,0.239029,0.209879
