## First example of word2vec

In [51]:
from gensim.models import Word2Vec
from gensim.models.ldamodel import LdaModel
from gensim import corpora
import spacy
import pandas as pd

In [2]:
# Load german spacy model
nlp = spacy.load('de_core_news_sm')

# Read and process christmas carol
# Read christmas carol data set
#with open('../datasets/christmas_carol.txt', 'r') as dickens:
#    the_book = dickens.read().decode('utf-8')
    
#doc = nlp(the_book)

In [8]:
# Read data
cognitive_load_data = pd.read_csv("../datasets/clt.csv", sep=",", encoding = "utf-8")

# Store all texts in long string
cognitive_load_long_string = ""

# Loop over every text
for document in cognitive_load_data["text"]:
    cognitive_load_long_string += document
    
doc = nlp(cognitive_load_long_string)

In [9]:
# Create corpus from data
documents = []

# Loop over every sentence
for sentence in doc.sents:
    tokens_in_sentence = []
    # Get every token
    for token in sentence:
        if token.is_alpha and token.is_stop != True and token.pos_ in ['NOUN']:
            tokens_in_sentence.append(token.lemma_.lower())
            
    # Add new sentence to documents
    documents.append(tokens_in_sentence)    

# Build dictionary from documents
dictionary = corpora.Dictionary(documents)

# Create corpus
corpus = [dictionary.doc2bow(sentence) for sentence in documents]

In [33]:
# Train word2vec
model = Word2Vec(documents, min_count=5)

In [34]:
print(documents)

[[u'lernproze\xdf', u'arbeitsged\xe4chtnis', u'kurzzeitged\xe4chtnis'], [u'arbeitsged\xe4chtnis', u'anstrengung', u'k\xf6rper', u'ressourcenkapazit\xe4t', u'verf\xfcgung'], [u'ph\xe4nomen', u'fokus'], [u'belastung', u'arbeitsged\xe4chtnisses'], [u'faktor', u'arbeitsspeicher', u'reizen', u'alltag', u'reaktion', u'reizen', u'bedeutung'], [u'beispiel', u'verarbeitung', u'reiz'], [u'stra\xdfenverkehr', u'voraussetzung', u'reizen', u'belastung', u'reizen', u'lernproze\xdf'], [u'grund', u'bibliothek', u'rede', u'l\xe4rmbel\xe4stigung'], [u'lernen', u'input', u'formen', u'text', u'langzeitged\xe4chtnis'], [u'lernprozess', u'platzen', u'input'], [u'einfluss', u'reizen', u'lernerfolg'], [], [u'einwirkung', u'reiz', u'lernende', u'arbeitsspeicher', u'fallen', u'reiz', u'arbeitsspeicher'], [u'stress', u'gedanke', u'beispiel'], [u'einflussfaktoren', u'lernerfolg', u'vorwissen', u'vorstellungsverm\xf6gen'], [u'faktor', u'inhalt', u'arbeitsspeicher', u'langzeitged\xe4chtnis'], [u'theorie', u'bildung

In [50]:
print(model)

# Get most similar words to a specific term
print(model.most_similar('vorwissen'))

# Get similarity between two terms
print(model.similarity('vorwissen', 'information'))

# Get vector representation of word
print(model.wv['vorwissen'])
print(model['information'])

# Play around with model
print(model.wv.most_similar(positive=['lehrer'], negative=['person']))

# Check what doesn't fit in the model
print(model.doesnt_match("arbeitsgedächnis information kurzeitspeicher langzeitspeicher".split()))

Word2Vec(vocab=124, size=100, alpha=0.025)
[(u'information', 0.5339909791946411), (u'sch\xfcler', 0.5263341665267944), (u'thema', 0.471238374710083), (u'lernende', 0.46889516711235046), (u'bezug', 0.4609374403953552), (u'belastung', 0.4495766758918762), (u'text', 0.4446403682231903), (u'aufmerksamkeit', 0.43844881653785706), (u'seite', 0.43264949321746826), (u'person', 0.413810670375824)]
0.533990966602
[ -6.46224990e-03   2.14940333e-03   2.59605562e-03   9.52842180e-03
   4.60224925e-03  -2.76823342e-03  -3.51856649e-03   2.76701618e-03
  -4.05324809e-03   4.40452900e-03   8.60047899e-03  -2.10578064e-03
   6.81298319e-04   4.74661216e-03   7.04613980e-03  -4.19404963e-03
  -4.58842563e-03  -1.33072631e-03  -6.63532061e-04   3.82143119e-03
   7.67149962e-03   1.23298750e-03  -1.79579051e-03  -3.99582135e-03
  -7.45459693e-04   5.84665826e-03   3.55368573e-03   1.83640033e-04
  -1.59388047e-03  -1.83257274e-03  -2.97513581e-03  -6.12656772e-03
   7.64889596e-03  -4.29744506e-03   5.02

In [68]:
lda = LdaModel(corpus, num_topics=1, id2word = dictionary)

In [69]:
for topic, word in lda.show_topics():
    print(word)
    print('\n')

0.062*"information" + 0.056*"belastung" + 0.028*"lernen" + 0.026*"vorwissen" + 0.021*"lernende" + 0.018*"arbeitsgedächtnis" + 0.017*"beispiel" + 0.016*"gedächtnis" + 0.014*"langzeitgedächtnis" + 0.014*"arbeitsspeicher"


