In [55]:
import gensim.models as models

In [56]:
import gensim.corpora as corpora

In [57]:
#load models

In [58]:
lda_model = models.ldamodel.LdaModel.load(r'/home/carina/Documents/Uni/WS-19-20/NLP-Projekt/Git/nlp-projekt/notebook/modeling/lda_model')

In [59]:
lsi_model = models.LsiModel.load(r'//home/carina/Documents/Uni/WS-19-20/NLP-Projekt/Git/nlp-projekt/notebook/modeling/lsi_model')

In [60]:
fasttext_model = models.FastText.load(r'/home/carina/Documents/Uni/WS-19-20/NLP-Projekt/Git/nlp-projekt/notebook/modeling/fasttext_model')

In [61]:
doc2vec_model = models.Doc2Vec.load(r'/home/carina/Documents/Uni/WS-19-20/NLP-Projekt/Git/nlp-projekt/notebook/modeling/doc2vec_model')

In [62]:
word2vec_model = models.Word2Vec.load(r'/home/carina/Documents/Uni/WS-19-20/NLP-Projekt/Git/nlp-projekt/notebook/modeling/word2vec_model')

In [63]:
#Distance Metrics like Jaccard (LDA, LSI)

In [64]:
# Read the whole cleaned corpus into memory. The context manager closes the
# file even when reading fails, so no handle is left open between cells.
with open(r'/home/carina/Documents/Uni/WS-19-20/NLP-Projekt/Git/nlp-projekt/data/processed/BWL_Corpus_cleaned.csv', encoding='UTF-8') as read_object:
    text_string = read_object.read()

In [67]:
doc_text = text_string.replace(';', '')

In [68]:
doc_strings = doc_text.split('\n' + '\n') #array of strings per doc

In [69]:
# One token list per document. Everything is built in a single expression so
# that re-running the cell can never append the same documents a second time.
documents = [
    [token for token in doc.split(' ') if token]  # skip empty strings left by repeated spaces
    for doc in doc_strings
]

In [71]:
import itertools
import networkx as nx
from gensim.matutils import jaccard

In [72]:
def get_most_likely_topic(doc):
    """Return the id of the LDA topic with the highest probability for ``doc``.

    ``doc`` is converted to bag-of-words with the dictionary attached to the
    global ``lda_model`` and then scored by that model. When several topics
    share the highest probability, the first one reported by the model wins.
    """
    bag_of_words = lda_model.id2word.doc2bow(doc)
    topic_distribution = lda_model.get_document_topics(bag_of_words)
    # max() returns the first maximal (topic, probability) pair it encounters.
    best_topic, _ = max(topic_distribution, key=lambda pair: pair[1])
    return best_topic

In [75]:
def get_node_color(i):
    """Pick the node colour for document ``i``: 'skyblue' for topic 0, 'pink' otherwise."""
    if get_most_likely_topic(documents[i]) == 0:
        return 'skyblue'
    return 'pink'

In [98]:
# NOTE(review): the saved output of this cell is `KeyError: dtype('float32')`.
# Presumably it is raised while get_node_color() queries the LDA model -
# confirm before relying on the plot.

# Undirected graph with one node per document index.
G = nx.Graph()
G.add_nodes_from(range(len(documents)))

# Connect every pair of documents, weighting the edge by the inverse distance.
for left, right in itertools.combinations(range(len(documents)), 2):
    # jaccard() receives the raw token lists here, not bag-of-words vectors.
    dissimilarity = jaccard(documents[left], documents[right])
    if dissimilarity == 0:
        continue  # a zero distance cannot be inverted, so no edge is added
    G.add_edge(left, right, weight=1 / dissimilarity)

#
# https://networkx.github.io/documentation/networkx-1.9/examples/drawing/weighted_graph.html
#
pos = nx.spring_layout(G)

# Split the edges into strong and weak ones around the weight threshold.
threshold = 1.25
elarge = [edge[:2] for edge in G.edges(data=True) if edge[2]['weight'] > threshold]
esmall = [edge[:2] for edge in G.edges(data=True) if edge[2]['weight'] <= threshold]

node_colors = [get_node_color(index) for index in range(len(documents))]
nx.draw_networkx_nodes(G, pos, node_size=700, node_color=node_colors)
# Strong edges are drawn solid, weak edges dashed and faded.
nx.draw_networkx_edges(G, pos, edgelist=elarge, width=2)
nx.draw_networkx_edges(
    G, pos, edgelist=esmall, width=2, alpha=0.2, edge_color='b', style='dashed'
)
nx.draw_networkx_labels(G, pos, font_size=20, font_family='sans-serif')

KeyError: dtype('float32')

In [None]:
#Principal Component Analysis (word2vec, doc2vec, fastText)

In [76]:
from sklearn.decomposition import IncrementalPCA    
from sklearn.manifold import TSNE               
import numpy as np                   

In [77]:
def reduce_dimensions(model):
    """Project every word vector of ``model`` down to two dimensions with t-SNE.

    Parameters
    ----------
    model : object exposing its word vectors as ``model.wv``
        The vocabulary is read from ``model.wv.key_to_index`` when that
        attribute exists and from ``model.wv.vocab`` otherwise, so both
        naming schemes are accepted.

    Returns
    -------
    x_vals, y_vals : list
        First and second t-SNE coordinate of every word, in vocabulary order.
    labels : numpy.ndarray
        The words, aligned with ``x_vals`` / ``y_vals``.
    """
    num_dimensions = 2

    keyed_vectors = model.wv
    # Newer gensim releases name the vocabulary mapping ``key_to_index``;
    # fall back to the older ``vocab`` attribute when it is absent.
    vocab = getattr(keyed_vectors, 'key_to_index', None)
    if vocab is None:
        vocab = keyed_vectors.vocab

    words = list(vocab)
    vectors = np.asarray([keyed_vectors[word] for word in words])
    labels = np.asarray(words)

    # A fixed random_state keeps the layout reproducible between runs.
    tsne = TSNE(n_components=num_dimensions, random_state=0)
    embedded = tsne.fit_transform(vectors)

    x_vals = [point[0] for point in embedded]
    y_vals = [point[1] for point in embedded]
    return x_vals, y_vals, labels

In [78]:
def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    """Draw the labelled points as a plotly text scatter.

    When ``plot_in_notebook`` is true the figure is rendered inline through
    ``iplot``; otherwise ``plot`` is called with the file name
    'word-embedding-plot.html'.
    """
    import plotly.graph_objs as go
    from plotly.offline import init_notebook_mode, iplot, plot

    data = [go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)]

    if not plot_in_notebook:
        plot(data, filename='word-embedding-plot.html')
        return

    init_notebook_mode(connected=True)
    iplot(data, filename='word-embedding-plot')

In [79]:
def plot_with_matplotlib(x_vals, y_vals, labels, num_labels=25):
    """Scatter-plot the points and annotate a random subset of them.

    Parameters
    ----------
    x_vals, y_vals : sequence
        Coordinates of the points, aligned with ``labels``.
    labels : sequence
        Text label of every point.
    num_labels : int, optional
        How many randomly chosen points get a text annotation (default 25).
        The value is capped at ``len(labels)`` so that small inputs no longer
        make the sampling fail.
    """
    import matplotlib.pyplot as plt
    import random

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    #
    # Label a randomly subsampled set of data points. A private generator
    # seeded with 0 yields the same selection as seeding the module-level
    # generator, without resetting the global random state for other code.
    #
    rng = random.Random(0)
    sample_size = min(num_labels, len(labels))
    for i in rng.sample(range(len(labels)), sample_size):
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))

In [89]:
x_vals, y_vals, labels = reduce_dimensions(word2vec_model)

In [90]:
try:
    get_ipython()
except Exception:
    plot_function = plot_with_matplotlib
else:
    plot_function = plot_with_plotly

plot_function(x_vals, y_vals, labels)

In [None]:
#Intrinsic Evaluation Method (Perplexity via Coherence Model) (LSI, LSA)

In [81]:
# Build a gensim Dictionary from the tokenised documents.
# NOTE(review): this dictionary is created here instead of reusing
# lda_model.id2word - confirm that both assign the same ids to the same tokens.
vocabulary = corpora.Dictionary(documents)

In [82]:
# Convert every token list to its bag-of-words form using that dictionary.
bow_corpus = [vocabulary.doc2bow(document) for document in documents]

In [83]:
# Fit a TF-IDF model on the bag-of-words corpus.
tdif_model = models.TfidfModel(bow_corpus)

In [84]:
# Apply the TF-IDF model to the bag-of-words corpus.
tfdif_corpus = tdif_model[bow_corpus]

In [85]:
coherence = models.CoherenceModel(model=lda_model, corpus=tfdif_corpus,
                    dictionary=vocabulary, coherence='c_v',
                    texts=[[w for w in d if w in vocabulary.token2id] for d in documents])

In [86]:
coherence.get_coherence()

0.548551911788138

In [87]:
#Eyeballing Method (LDA)

In [92]:
import pyLDAvis.gensim
import pickle 
import pyLDAvis

In [97]:
# Switch pyLDAvis to notebook display mode, then prepare the visualisation data
# for the loaded LDA model and show it as the cell result.
# NOTE(review): the saved output of this cell is `KeyError: dtype('float32')`,
# so the visualisation was not produced in the recorded run.
# NOTE(review): the TF-IDF weighted corpus and a freshly built dictionary are
# passed here - confirm that prepare() should not receive the bag-of-words
# corpus and the dictionary the model was trained with instead.
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, tfdif_corpus, vocabulary)
LDAvis_prepared

KeyError: dtype('float32')