### Vienna Fin de secile 

We will be analysing the texts from our Yearbooks, particularly from 1870-1913. The data is already cleaned and organised, so the purpose of this notebook is to use this cleaned data to create the data structures we need for two kinds of analysis: an evolving dynamic topic model, and creating a word embedding model on the entire data set so that we can capture semantic ideaologies.

1870-1881
1882-1897
1897-1906
1907-1913

Dynamic Topic Models backwards - where do these topics come from?

Move in fighting poverty from decentralised to centralised. 

#### Imports and Data

In [1]:
import numpy as np
import spacy
import gensim
import csv
import pandas as pd

In [2]:
docs = {}

In [5]:
with open('yearbooks_tibble_unstemmed.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
#         print(row)
        doc, term, count, year, page, decade, period, red, fineperiods, total, all_ = row
        
#         year, page, term, count, doc, decade, period, total, all_ = row
        if (year, page) not in docs:
            docs[(year, page)] = []
        if (year, page) in docs:
            docs[(year, page)].append(term)   

In [6]:
del docs[('year', 'page')]

In [7]:
docs[('1870', '568')]

['abgabe',
 'abgeben',
 'abgebenen',
 'allgemeinen',
 'alter',
 'amtshandlung',
 'angewiesen',
 'anspruch',
 'anstalt',
 'antrag',
 'anweisung',
 'armen',
 'armenkinderpflege',
 'armenpflege',
 'aufnahme',
 'aufsicht',
 'ausbezahlt',
 'ausgenommen',
 'ausser',
 'ausserhalb',
 'austritt',
 'beamten',
 'bedarf',
 'bekleidung',
 'bereits',
 'beschlossen',
 'besonders',
 'bestehenden',
 'bestellten',
 'besuchen',
 'beteilung',
 'betrage',
 'bewaehrten',
 'bezirke',
 'blatter',
 'bleibende',
 'dienst',
 'dritten',
 'duerfen',
 'eigentliche',
 'einteilung',
 'eintritt',
 'eintritte',
 'erfolgt',
 'erforderlichen',
 'ergibt',
 'erhalten',
 'errichtung',
 'erwerbs',
 'eventuellen',
 'falle',
 'finden',
 'fuehrt',
 'gattin',
 'gebiete',
 'gebunden',
 'gefuehrt',
 'gelegene',
 'gemeindebezirke',
 'genommen',
 'gestatten',
 'gestorben',
 'gilt',
 'gruenden',
 'grundsatz',
 'gut',
 'haenden',
 'hauser',
 'hauses',
 'innerhalb',
 'inzwischen',
 'jahre',
 'jahren',
 'kind',
 'kinder',
 'knaben',
 'k

### Dynamic Topic Model

"1870-1883"
"1884-1896"
"1897-1905"
"1906-1909"
"1910-1913”


In [6]:
timed_docs = [[], [], [], [], []]

In [7]:
for doc in docs:
    year, page = doc
    if int(year) >= 1870 and int(year) <=1883:
        timed_docs[0].append(docs[doc])
    if int(year) >= 1884 and int(year) <=1896:
        timed_docs[1].append(docs[doc])
    if int(year) >= 1897 and int(year) <=1905:
        timed_docs[2].append(docs[doc])
    if int(year) >= 1906 and int(year) <=1909:
        timed_docs[3].append(docs[doc])
    if int(year) >= 1910 and int(year) <=1913:
        timed_docs[4].append(docs[doc])

In [8]:
final_docs = []
times = []

In [9]:
for period in timed_docs:
    times.append(len(period))
    for doc in period:
        final_docs.append(doc)

In [10]:
from gensim.models import LdaSeqModel
from gensim.corpora import Dictionary

In [11]:
# bigram = gensim.models.Phrases(final_docs)

In [12]:
# texts = [bigram[line] for line in final_docs]

In [13]:
dictionary = Dictionary(final_docs)
corpus = [dictionary.doc2bow(text) for text in final_docs]

In [14]:
ldaseq = LdaSeqModel.load("ldaseq_5")

In [15]:
# ldaseq = LdaSeqModel(corpus=corpus, id2word=dictionary, time_slice=times, num_topics=5, chunksize=1)

In [16]:
# ldaseq.print_topics(time=0)[9]

In [17]:
# ldaseq.print_topics(time=3)[9]

In [18]:
# proportion of topics over time
# lesser topics so that we can see more evolution
# csv file of 0th and 3rd
# unigrams for next run
# allow more changes for each period
# backward topic models
# korporation comes up in prevelance
# functions to see change

In [19]:
ldaseq_fast = LdaSeqModel.load("ldaseq_fast_5")

In [20]:
# ldaseq_fast = LdaSeqModel(corpus=corpus, id2word=dictionary, time_slice=times, num_topics=5, chunksize=1, chain_variance=0.05)

In [21]:
# ldaseq_fast.print_topics(time=0)[8]

In [22]:
# ldaseq_fast.print_topics(time=3)[8]

In [23]:
# corpus.reverse()

In [24]:
# times.reverse()

In [25]:
# rev_corpus = list(reversed(corpus))

In [26]:
# rev_times = list(reversed(times))

In [27]:
ldaseq_rev = LdaSeqModel.load("ldaseq_rev_5")

In [28]:
ldaseq_rev_fast = LdaSeqModel.load("ldaseq_rev_fast_5")

In [29]:
# ldaseq_rev = LdaSeqModel(corpus=corpus, id2word=dictionary, time_slice=times, num_topics=5, chunksize=1)

In [30]:
# ldaseq_rev_fast = LdaSeqModel(corpus=corpus, id2word=dictionary, time_slice=times, num_topics=5, chunksize=1, chain_variance=0.05)

In [31]:
# ldaseq_rev.print_topics(time=0)[9]

In [32]:
# ldaseq_rev.print_topics(time=3)[9]

In [33]:
# ldaseq_rev_fast.print_topics(time=0)[9]

In [34]:
# ldaseq_rev_fast.print_topics(time=3)[9]

In [35]:
# ldaseq.save("ldaseq_5")

In [36]:
# ldaseq_rev.save("ldaseq_rev_5")

In [37]:
# ldaseq_fast.save("ldaseq_fast_5")

In [38]:
# ldaseq_rev_fast.save("ldaseq_rev_fast_5")

In [39]:
import csv

In [40]:
ger_to_eng = {}

In [41]:
with open('yearbooksGERtoENG.csv') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        ger, eng = row
        ger_to_eng[ger] = eng

In [42]:
missing_words = []

In [43]:
for doc in final_docs:
    for word in doc:
        if word not in ger_to_eng and word not in missing_words:
            missing_words.append(word)

In [44]:
missing_words

['stiftnngsinteress',
 'werkhans',
 'armemves',
 'deeemb',
 'van',
 'crzten',
 'verpflegsgebu',
 'geba',
 'abzufuehr',
 'gebu',
 'dcbelstaend',
 'armcnwes',
 'verteilnng',
 'vorjaehr',
 'armenweseii',
 'curerfolg',
 'sechswoechent',
 'taegig',
 'beruecksichtigt',
 'capital',
 'crzte',
 'arinenwes',
 'instruction',
 'bezirksarmenraet',
 'vielmehr',
 'aeuss',
 'jahrbuch',
 'massnahm',
 'auslaend',
 'entspricht',
 'uber',
 'erhaltungsbeitraeg',
 'reinertraegnis',
 'adaptier',
 'zahlpflegling',
 'armenkinderxfleg',
 'sieh',
 'san',
 'uud',
 'armenweseil',
 'stadtrat',
 'versicherungsbeitraeg',
 'bezirksaemt',
 'amtsaerzt',
 'krankenanstaltenfond',
 'abloes',
 'waeld',
 'solang',
 'xlx',
 'frequenzfaell',
 'kommunalplaetz',
 'jahrbuech',
 'spitalpfleg',
 'dgl',
 'cquivalent',
 'stadtratsbeschlussesvom',
 'krem',
 'jsolierpavillon',
 'abzuhelf',
 'blaett',
 'armeiiwes',
 'jrrenhausfond',
 'desgleich',
 'keinerlei',
 'maennerasyl',
 'heilbaed',
 'zahnaerzt',
 'valeri',
 'kinderhort',
 'reichs

In [45]:
def topics_in_english(topic_model, topic_number, time_slice):
    words  = topic_model.print_topics(time=time_slice)[topic_number]
    eng_words = []
    for word in words:
        ger_word, proportion = word
        try:
            eng_words.append((ger_to_eng[ger_word], proportion))
        except KeyError:
            eng_words.append((ger_word, proportion))            
    return eng_words

In [46]:
len(ger_to_eng)

12183

In [47]:
topics_in_english(ldaseq_fast, 0, 0)

[('receive', 0.010754727694773336),
 ('institution', 0.010252086803703927),
 ('poor', 0.006863057664176687),
 ('general', 0.006615412582630152),
 ('display', 0.006508949789234518),
 ('recorded', 0.006502533112836464),
 ('housed', 0.006212435318177244),
 ('located', 0.006165315798980222),
 ('person', 0.006102322193033307),
 ('cost', 0.006026094367558108),
 ('child', 0.005986640891899897),
 ('certainly', 0.005900034614230641),
 ('ready', 0.005766180594775818),
 ('exist', 0.005185621823100179),
 ('take care', 0.00437759467054447),
 ('was standing', 0.004161556494547738),
 ('number', 0.003864280894748425),
 ('administered', 0.0038159287733107844),
 ('supply', 0.003753969312885847),
 ('Every day', 0.003753518481906581)]

In [48]:
topics_in_english(ldaseq_fast, 0, 3)

[('poor man', 0.006711526863857177),
 ('institution', 0.006630079952225058),
 ('caring', 0.00631372839380569),
 ('person', 0.005858497510367664),
 ('receive', 0.00511497532388471),
 ('poor care', 0.005097636951179339),
 ('ready', 0.0047158491102408965),
 ('housed', 0.004330370013983238),
 ('cost', 0.004315759189867077),
 ('poor', 0.004252498257713914),
 ('recorded', 0.0037901960961634487),
 ('take care', 0.0037834665937442856),
 ('supply house', 0.0036913426552337984),
 ('House', 0.0035989769614703584),
 ('number', 0.003487434815977912),
 ('old', 0.003441052294827931),
 ('display', 0.0034398716549017875),
 ('approved', 0.003413875613399699),
 ('care home', 0.0033424101692449745),
 ('poor supply', 0.0032533042335896496)]

In [49]:
import operator

In [51]:
def tracking_change(topic_model, topic_num, first_time_period=0, second_time_period=4):
    
    topics_begin = topic_model.print_topics(time=first_time_period)[topic_num]
    topics_end = topic_model.print_topics(time=second_time_period)[topic_num]
    
    word_ranks_begin = {}
    word_ranks_end = {}
    word_change = {}
    words_begin, words_end = [], []
    
    for num, word_prob in enumerate(topics_begin):
        word, prob = word_prob
        word_ranks_begin[word] = num
        words_begin.append(word)
            
    for num, word_prob in enumerate(topics_end):
        word, prob = word_prob
        word_ranks_end[word] = num
        words_end.append(word)

    for word in words_begin:
        if word not in word_ranks_end:
            word_ranks_end[word] = 21
    
    for word in words_end:
        if word not in word_ranks_begin:
            word_ranks_begin[word] = 21
    
    all_words = list(set(words_begin) | set(words_end)) 
    
    for word in all_words:
        word_change[word] = word_ranks_begin[word] - word_ranks_end[word]

    
    sorted_word_change = sorted(word_change.items(), key=operator.itemgetter(1))
    
    sorted_word_change.reverse()
    
    return sorted_word_change
                                                                    

In [52]:
# change here to track prevalance. since there are so many words, the word change probabilities are not super useful... yet.
# should chat about this when we meet next
def tracking_change_prevalance(topic_model, topic_num, first_time_period=0, second_time_period=4):
    
    topics_begin = topic_model.print_topics(time=first_time_period)[topic_num]
    topics_end = topic_model.print_topics(time=second_time_period)[topic_num]
    
    word_prevalance_begin = {}
    word_prevalance_end = {}
    word_change = {}
    words_begin, words_end = [], []
    all_probs = []
    
    for num, word_prob in enumerate(topics_begin):
        word, prob = word_prob
        word_prevalance_begin[word] = prob
        words_begin.append(word)
        all_probs.append(prob)
        
    for num, word_prob in enumerate(topics_end):
        word, prob = word_prob
        word_prevalance_end[word] = prob
        words_end.append(word)
        all_probs.append(prob)
    
    min_prob = min(all_probs)
    
    for word in words_begin:
        if word not in word_prevalance_end:
            word_prevalance_end[word] = min_prob
    
    for word in words_end:
        if word not in word_prevalance_begin:
            word_prevalance_begin[word] = min_prob
    
    all_words = list(set(words_begin) | set(words_end)) 
    
    for word in all_words:
        word_change[word] = word_prevalance_end[word] - word_prevalance_begin[word]

    
    sorted_word_change = sorted(word_change.items(), key=operator.itemgetter(1))
    
    sorted_word_change.reverse()
    
    return sorted_word_change
                                                                    

In [53]:
tracking_change_prevalance(ldaseq_rev, 0)

[('anstalt', 0.0010942715669252888),
 ('stand', 0.0008532466888341687),
 ('zahl', 0.0007826466552624273),
 ('kind', 0.0007464512714123236),
 ('person', 0.0006584585372487279),
 ('kost', 0.0006170281868458352),
 ('waehrend', 0.0005032420331909215),
 ('auslag', 0.0004114964408793555),
 ('waisenhaus', 0.00038352143677943245),
 ('aufnahm', 0.00028878242780602543),
 ('aerztlich', 0.00028754264223521033),
 ('verpflegt', 0.00016800192737887604),
 ('arm', 0.00016354360571541916),
 ('knab', 0.00011152335902850016),
 ('durchschnitt', 7.910953534691345e-05),
 ('erhalt', 6.288894958691917e-05),
 ('untergebracht', 0.0),
 ('unentgelt', -7.973217781807873e-05),
 ('maedch', -0.00010406391322183208),
 ('verpfleg', -0.00020914856975816007),
 ('maennlich', -0.00043461577251965346),
 ('tag', -0.0005406452414673975)]

In [54]:
tracking_change(ldaseq_rev, 0)

[('stand', 5),
 ('aerztlich', 4),
 ('person', 4),
 ('zahl', 4),
 ('durchschnitt', 2),
 ('waehrend', 1),
 ('aufnahm', 1),
 ('kost', 1),
 ('erhalt', 0),
 ('anstalt', 0),
 ('kind', 0),
 ('knab', 0),
 ('waisenhaus', 0),
 ('maedch', -1),
 ('auslag', -1),
 ('arm', -1),
 ('verpflegt', -1),
 ('unentgelt', -1),
 ('untergebracht', -2),
 ('tag', -2),
 ('verpfleg', -5),
 ('maennlich', -8)]

In [55]:
tracking_change(ldaseq_rev_fast, 1)

[('besteh', 17),
 ('bereit', 15),
 ('bezueg', 10),
 ('erfolgt', 9),
 ('stadt', 7),
 ('interess', 6),
 ('betrag', 6),
 ('person', 5),
 ('word', 5),
 ('armenpfleg', 4),
 ('arm', 3),
 ('weit', 3),
 ('armenbezirk', 2),
 ('zweck', 1),
 ('allgemein', 1),
 ('bestimm', 0),
 ('unterstuetz', -1),
 ('betreff', -2),
 ('armenwes', -2),
 ('gesetz', -3),
 ('bestimmt', -4),
 ('stiftung', -5),
 ('erricht', -5),
 ('kapital', -6),
 ('kind', -9),
 ('zahl', -10),
 ('folgend', -11),
 ('vermoeg', -16),
 ('fond', -20)]

In [56]:
def doc_topic_proportions(corpus, model, times, no_topics):
    
    max_proportions = {}
    
    for num, doc in enumerate(corpus):
        topic_proportions = model[doc]
        topics = np.nonzero(topic_proportions > 0.33)
        max_proportions[num] = topics[0]
        
    ranges = [0]
    for num in np.cumsum(times):
        ranges.append(num)
    
    time_period_counts = {}
    for i in range(0, len(times)):
        time_period_counts[i] = np.zeros(no_topics)
        
    for doc in max_proportions:
        if doc < ranges[1] and doc > ranges[0]:
            for val in max_proportions[doc]:
                time_period_counts[0][val] += 1
        if doc < ranges[2] and doc > ranges[1]:
            for val in max_proportions[doc]:
                time_period_counts[1][val] += 1
        if doc < ranges[3] and doc > ranges[2]:
            for val in max_proportions[doc]:
                time_period_counts[2][val] += 1
        if doc < ranges[4] and doc > ranges[3]:
            for val in max_proportions[doc]:
                time_period_counts[3][val] += 1
        if doc < ranges[5] and doc > ranges[4]:
            for val in max_proportions[doc]:
                time_period_counts[4][val] += 1
    
    time_period_proportions = {}

    for time in time_period_counts:
        time_period_proportions[time] = np.round(time_period_counts[time] / np.sum(time_period_counts[time]), 2)

    return time_period_proportions

In [57]:
time_period_proportions = doc_topic_proportions(corpus, ldaseq_fast, times, 5)

In [58]:
time_period_proportions

{0: array([0.15789474, 0.23578947, 0.06736842, 0.07368421, 0.46526316]),
 1: array([0.225     , 0.35416667, 0.1625    , 0.15833333, 0.1       ]),
 2: array([0.2034384 , 0.33524355, 0.15186246, 0.20630372, 0.10315186]),
 3: array([0.26519337, 0.32596685, 0.12707182, 0.15469613, 0.12707182]),
 4: array([0.25096525, 0.36679537, 0.17760618, 0.11583012, 0.08880309])}

In [60]:
ldaseq_fast.print_topics(time=4)[2]

[('person', 0.009592862048061661),
 ('krank', 0.006899718294721143),
 ('kost', 0.006558729833661),
 ('arm', 0.0055836104672836884),
 ('unentgelt', 0.0053879770089733355),
 ('zufolg', 0.005326531827128796),
 ('aerztlich', 0.005313259758325845),
 ('tag', 0.005205118582778899),
 ('zahl', 0.00519732106753351),
 ('erhalt', 0.004838309134514186),
 ('armenwes', 0.004794930969439805),
 ('fuersorg', 0.004670779812433583),
 ('weit', 0.004503958899946665),
 ('pfleg', 0.004487984946031832),
 ('betrag', 0.0044679376647961),
 ('bad', 0.004311841999261475),
 ('heimatberechtigt', 0.004293355370472746),
 ('zustaend', 0.004255583641256706),
 ('stadt', 0.004180054041896383),
 ('zweck', 0.004118677522181464)]

In [61]:
tracking_change(ldaseq_fast, 2)

[('zufolg', 16),
 ('tag', 14),
 ('armenwes', 11),
 ('fuersorg', 10),
 ('weit', 9),
 ('pfleg', 8),
 ('zustaend', 4),
 ('krank', 3),
 ('bad', 3),
 ('person', 3),
 ('stadt', 3),
 ('zahl', 3),
 ('zweck', 2),
 ('aerztlich', 1),
 ('betrag', 0),
 ('kost', 0),
 ('erhalt', -1),
 ('heimatberechtigt', -1),
 ('anstalt', -2),
 ('arm', -2),
 ('unentgelt', -4),
 ('daselb', -4),
 ('heilanstalt', -5),
 ('allgemein', -8),
 ('spital', -9),
 ('jaehrlich', -11),
 ('verpflegt', -12),
 ('kind', -15),
 ('auslag', -16)]

In [63]:
# change here to include all time periods in a single file
def topics_to_csv(model, name_file, times=14):
    with open(name_file, 'w', newline='') as csvfile:
        spamwriter = csv.writer(csvfile)
        for time in range(0, times):
            spamwriter.writerow(["Topic Number for time period " + str(time) , "Word, Probability"])
            topics = model.print_topics(time=time, top_terms=30)
            for num, topic in enumerate(topics):
                topic.insert(0, num)
                spamwriter.writerow(topic)
                

In [64]:
topics_to_csv(ldaseq_fast, "ldaseq_fast_5topics.csv")

In [65]:
# change here to include time periods of choice: default is between first and last (i.e 0 and 4)
def change_to_csv(model, name_file, num_of_topics=7, first_time_period=0, second_time_period=4):
    with open(name_file, 'w', newline='') as csvfile:
        spamwriter = csv.writer(csvfile)
        spamwriter.writerow(["Topic Number", "Word, Change"])
        for i in range(0, num_of_topics):
            changes = tracking_change(model, i, first_time_period=first_time_period, second_time_period=second_time_period)
            changes.insert(0, i)
            spamwriter.writerow(changes)

In [81]:
# new format to match what Christof wanted
def change_to_csv_advanced(model, name_file, num_of_topics=7, first_time_period=0, second_time_period=4):
    with open(name_file, 'w', newline='') as csvfile:
        spamwriter = csv.writer(csvfile)
        spamwriter.writerow(["Topic Number", "Deutsch", "Rank", "English"])
        for i in range(0, num_of_topics):
            changes = tracking_change(model, i, first_time_period=first_time_period, second_time_period=second_time_period)
#             changes.insert(0, i)
            for word_rank in changes:
                word, rank = word_rank
                try:
                    spamwriter.writerow(["Topic " + str(i), word, rank, ger_to_eng[word]])
                except KeyError:
                    spamwriter.writerow(["Topic " + str(i), word, rank, word])
#                 spamwriter.writerow(changes)

In [82]:
change_to_csv_advanced(ldaseq_fast, "ldaseq_fast_changes_5topics_new.csv", num_of_topics=5)

In [66]:
change_to_csv(ldaseq_fast, "ldaseq_fast_changes_5topics.csv", num_of_topics=5)

In [None]:
# tracking change in prevelance from period to period