### Vienna Analysis

We will be analysing the texts from our Yearbooks, covering 1870–2009. The data is already cleaned and organised, so the purpose of this notebook is to turn that cleaned data into the data structures we need for two kinds of analysis: an evolving dynamic topic model, and a word embedding model trained on the entire data set so that we can capture semantic ideologies.

1870–1913: pre Red Vienna (could start with 1890 to balance length of period)

1918–1935: Red Vienna

1946–1968: Reconstruction

1969–1989: Iron curtain

1990–2009: Postmodernity 

Dynamic Topic Models backwards - where do these topics come from?

Move in fighting poverty from decentralised to centralised. 

#### Imports and Data

In [25]:
import numpy as np
import spacy
import gensim
import csv
import pandas as pd

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
docs = {}

In [3]:
# Group the terms of the yearbook table by (year, page): every CSV row
# contributes its `term` to the list stored under that page's key.
# The header row is ingested like any other row (under the key
# ('year', 'page')); it is removed in a later cell.
with open('yearbooks_tibble_all.csv', newline='') as csvfile:
    # Each row must have exactly six fields; only the first three are used.
    for year, page, term, _count, _decade, _total in csv.reader(csvfile):
        docs.setdefault((year, page), []).append(term)

In [4]:
# Drop the pseudo-document created from the CSV header row.
# Guarded so that re-running this cell does not raise KeyError.
if ('year', 'page') in docs:
    del docs[('year', 'page')]

In [5]:
docs[('1970', '56')]

['bauteil',
 'bauwerk',
 'bedeut',
 'bett',
 'doebling',
 'fertigstell',
 'gestieg',
 'instandsetzungsarbeit',
 'international',
 'kirch',
 'kultur',
 'million',
 'otto',
 'renovi',
 'schilling',
 'stad',
 'stadt',
 'steinhof',
 'studentenheim',
 'volksbild',
 'vorgeseh',
 'wagn',
 'zweit']

### Dynamic Topic Model

1870–1913: pre Red Vienna (could start with 1890 to balance length of period)

1918–1935: Red Vienna

1946–1968: Reconstruction

1969–1989: Iron curtain

1990–2009: Postmodernity 

#### New time periods

1870-1880

1881-1895

1896-1902

1903-1908

1909-1913

1918-1935

1946-1954

1955-1959

1960-1964

1965-1968

1969-1979

1980-1989

1990-1996

1997-2009



In [None]:
# timed_docs = [[], [], [], [], []]

In [None]:
# for doc in docs:
#     year, page = doc
#     if int(year) >= 1870 and int(year) <=1913:
#         timed_docs[0].append(docs[doc])
#     if int(year) >= 1918 and int(year) <=1935:
#         timed_docs[1].append(docs[doc])
#     if int(year) >= 1946 and int(year) <=1968:
#         timed_docs[2].append(docs[doc])
#     if int(year) >= 1969 and int(year) <=1989:
#         timed_docs[3].append(docs[doc])
#     if int(year) >= 1990 and int(year) <=2009:
#         timed_docs[4].append(docs[doc])

In [6]:
# One independent, initially empty bucket per time period (14 periods).
timed_docs = [[] for _ in range(14)]

In [7]:
# Inclusive (first_year, last_year) bounds of the 14 time periods, in
# chronological order. Pages whose year falls outside every range
# (e.g. 1914-1917 and 1936-1945) are left out.
PERIOD_BOUNDS = [
    (1870, 1880),
    (1881, 1895),
    (1896, 1902),
    (1903, 1908),
    (1909, 1913),
    (1918, 1935),
    (1946, 1954),
    (1955, 1959),
    (1960, 1964),
    (1965, 1968),
    (1969, 1979),
    (1980, 1989),
    (1990, 1996),
    (1997, 2009),
]

# Start from empty buckets so that re-running this cell cannot append the
# same pages a second time.
timed_docs = [[] for _ in PERIOD_BOUNDS]

for (year, page), terms in docs.items():
    year_num = int(year)  # keys hold the year as a string
    for period_idx, (first_year, last_year) in enumerate(PERIOD_BOUNDS):
        if first_year <= year_num <= last_year:
            timed_docs[period_idx].append(terms)
            break  # the ranges are disjoint, so at most one bucket matches

In [8]:
timed_docs[13][0]

['ftt', 'stadt', 'verwalt']

In [9]:
timed_docs[9][0]

['herausgegeb', 'stadt']

In [10]:
timed_docs[5][0]

['kis', 'len', 'oie', 'rot', 'uli', 'uni', 'vei']

In [11]:
final_docs = []
times = []

In [12]:
# Flatten the per-period buckets into one document list, ordered period by
# period, and record how many documents each period contributes.
# Both lists are rebuilt from scratch, so re-running this cell does not
# duplicate documents or extend `times`.
times = [len(period) for period in timed_docs]
final_docs = [doc for period in timed_docs for doc in period]

In [13]:
from gensim.models import LdaSeqModel
from gensim.corpora import Dictionary

In [14]:
# bigram = gensim.models.Phrases(final_docs)

In [15]:
# texts = [bigram[line] for line in final_docs]

In [16]:
# Build the gensim vocabulary from all documents, then convert every
# document to its bag-of-words form with that vocabulary. `corpus` keeps the
# same document order as `final_docs`.
dictionary = Dictionary(final_docs)
corpus = [dictionary.doc2bow(text) for text in final_docs]

In [17]:
len(dictionary)

178872

## Backwards Model with Sufficient Statistics

In [18]:
ldaseq_fast = LdaSeqModel.load("ldaseq_rev_fast_32_14periods_CV001")

In [19]:
sstats = ldaseq_fast.sstats

In [20]:
rev_corpus = list(reversed(corpus))

In [21]:
rev_times = list(reversed(times))

In [None]:
# Fit the dynamic topic model on the reversed corpus / time slices.
# initialize='own' together with sstats=... starts the model from the
# sufficient statistics supplied in `sstats` instead of a fresh initialisation.
ldaseq_rev = LdaSeqModel(
    corpus=rev_corpus,
    id2word=dictionary,
    time_slice=rev_times,
    num_topics=32,
    chunksize=1,
    chain_variance=0.05,
    initialize='own',
    sstats=sstats,
    random_state=0,
)

  convergence = np.fabs((bound - old_bound) / old_bound)


In [28]:
ldaseq_rev.save("ldaseq_32_topics_14_periods_reverse_cv005") # 0.05

In [None]:
# ldaseq_rev_fast = LdaSeqModel(corpus=rev_corpus, id2word=dictionary, time_slice=rev_times, num_topics=5, chunksize=1, chain_variance=0.05)

In [None]:
# ldaseq_rev.print_topics(time=0)[9]

In [None]:
# ldaseq_rev.print_topics(time=3)[9]

In [None]:
# ldaseq_rev_fast.print_topics(time=0)[9]

In [None]:
# ldaseq_rev_fast.print_topics(time=3)[9]

In [None]:
# ldaseq.save("ldaseq_5")

In [None]:
# ldaseq_rev.save("ldaseq_rev_5") 

In [None]:
# ldaseq_fast.save("ldaseq_fast_5")

In [None]:
# ldaseq_rev_fast.save("ldaseq_rev_fast_5")

## Coherence Values

In [None]:
from gensim.models.coherencemodel import CoherenceModel

In [None]:
# NOTE(review): `ldaseq` is not assigned in any cell visible in this notebook
# (only `ldaseq_fast` and `ldaseq_rev` are). Unless it is defined elsewhere,
# this raises NameError on a fresh kernel. Confirm which model is meant.
topics_dtm = ldaseq.dtm_coherence(time=2)

In [None]:
cm_DTM = CoherenceModel(topics=topics_dtm, corpus=corpus, dictionary=dictionary, coherence='u_mass')

In [None]:
print ("DTM Python coherence is", cm_DTM.get_coherence())

In [None]:
import pickle

# Load the tokenised texts used for the c_v coherence computation.
# NOTE: unpickling can execute arbitrary code; only load files you created.
with open('Corpus/texts', 'rb') as texts_file:
    texts = pickle.load(texts_file)

In [None]:
cm_DTM = CoherenceModel(topics=topics_dtm, texts=texts, dictionary=dictionary, coherence='c_v')

In [None]:
print ("DTM Python coherence is", cm_DTM.get_coherence())

### German to English dictionary

In [None]:
import csv

In [None]:
ger_to_eng = {}

In [None]:
# Fill the German -> English lookup from a two-column CSV (German, English).
# newline='' is what the csv module expects for files handed to csv.reader,
# and matches how the yearbook CSV is opened earlier in this notebook.
with open('yearbooksGERtoENG.csv', newline='') as csvfile:
    # Each row must have exactly two fields; a repeated German key keeps the
    # last translation seen.
    for ger, eng in csv.reader(csvfile):
        ger_to_eng[ger] = eng

In [None]:
missing_words = []

In [None]:
# Collect, in order of first appearance, every corpus word that has no entry
# in the German -> English lookup. A companion set does the "already seen"
# check so the scan is not quadratic in the number of missing words.
seen_missing = set(missing_words)
for doc in final_docs:
    for word in doc:
        if word not in ger_to_eng and word not in seen_missing:
            seen_missing.add(word)
            missing_words.append(word)

In [None]:
missing_words

In [None]:
def topics_in_english(topic_model, topic_number, time_slice):
    """Return one topic's (word, proportion) pairs with the words translated.

    Args:
        topic_model: object whose ``print_topics(time=...)`` returns, per
            topic, a sequence of ``(word, proportion)`` pairs.
        topic_number: index of the topic to translate.
        time_slice: time period passed to ``print_topics``.

    Returns:
        A list of ``(word, proportion)`` tuples in the model's order. Words
        found in the module-level ``ger_to_eng`` lookup are replaced by their
        translation; all other words are kept unchanged.
    """
    german_topic = topic_model.print_topics(time=time_slice)[topic_number]
    return [
        (ger_to_eng.get(ger_word, ger_word), proportion)
        for ger_word, proportion in german_topic
    ]

In [None]:
len(ger_to_eng)

In [None]:
topics_in_english(ldaseq_fast, 0, 0)

In [None]:
topics_in_english(ldaseq_fast, 0, 3)

In [None]:
import operator

### Change Tracker

In [None]:
def tracking_change(topic_model, topic_num, first_time_period=0, second_time_period=4, missing_rank=21):
    """Rank shift of a topic's top words between two time periods.

    Args:
        topic_model: object whose ``print_topics(time=...)`` returns, per
            topic, a sequence of ``(word, probability)`` pairs ordered from
            most to least probable.
        topic_num: index of the topic to inspect.
        first_time_period: time period used as the starting point.
        second_time_period: time period used as the end point.
        missing_rank: rank assigned to a word that is listed in only one of
            the two periods. The default of 21 is the value this function has
            always used (presumably chosen for 20-word topic lists; confirm
            if the model returns longer lists).

    Returns:
        A list of ``(word, change)`` tuples sorted by ``change`` in descending
        order, where ``change = rank_in_first - rank_in_second`` (0-based
        ranks), so positive values mean the word moved up. Words with equal
        change keep their order of first appearance (first period's list,
        then words new in the second period), which makes the output
        reproducible across kernel sessions.
    """
    topics_begin = topic_model.print_topics(time=first_time_period)[topic_num]
    topics_end = topic_model.print_topics(time=second_time_period)[topic_num]

    word_ranks_begin = {word: rank for rank, (word, _prob) in enumerate(topics_begin)}
    word_ranks_end = {word: rank for rank, (word, _prob) in enumerate(topics_end)}

    # dict.fromkeys de-duplicates while preserving insertion order; a plain
    # set union would hand ties to the sort in hash order, which differs
    # between interpreter sessions.
    all_words = list(dict.fromkeys([*word_ranks_begin, *word_ranks_end]))

    word_change = {
        word: word_ranks_begin.get(word, missing_rank) - word_ranks_end.get(word, missing_rank)
        for word in all_words
    }

    # sorted() is stable even with reverse=True, so ties keep `all_words` order.
    return sorted(word_change.items(), key=lambda item: item[1], reverse=True)
                                                                    

In [None]:
# Variant of the change tracker that compares word probabilities (prevalence)
# instead of ranks. Since there are so many words, the per-word probability
# changes are not very informative yet.
def tracking_change_prevalance(topic_model, topic_num, first_time_period=0, second_time_period=4):
    """Probability shift of a topic's top words between two time periods.

    Args:
        topic_model: object whose ``print_topics(time=...)`` returns, per
            topic, a sequence of ``(word, probability)`` pairs.
        topic_num: index of the topic to inspect.
        first_time_period: time period used as the starting point.
        second_time_period: time period used as the end point.

    Returns:
        A list of ``(word, change)`` tuples sorted by ``change`` in descending
        order, where ``change = probability_in_second - probability_in_first``.
        A word listed in only one period is given, for the other period, the
        smallest probability found in either list. Words with equal change
        keep their order of first appearance, so the output is reproducible
        across kernel sessions. If both lists are empty the result is ``[]``.
    """
    topics_begin = topic_model.print_topics(time=first_time_period)[topic_num]
    topics_end = topic_model.print_topics(time=second_time_period)[topic_num]

    prevalence_begin = {word: prob for word, prob in topics_begin}
    prevalence_end = {word: prob for word, prob in topics_end}

    # Stand-in probability for words absent from one of the two lists.
    # `default` avoids min() raising on two empty topic lists.
    floor_prob = min(
        [prob for _word, prob in [*topics_begin, *topics_end]],
        default=0.0,
    )

    # Insertion-ordered de-duplication; a set union would make tie order
    # depend on string hashing.
    all_words = list(dict.fromkeys([*prevalence_begin, *prevalence_end]))

    word_change = {
        word: prevalence_end.get(word, floor_prob) - prevalence_begin.get(word, floor_prob)
        for word in all_words
    }

    # Stable descending sort: ties keep `all_words` order.
    return sorted(word_change.items(), key=lambda item: item[1], reverse=True)
                                                                    

In [None]:
tracking_change_prevalance(ldaseq_rev, 0)

In [None]:
tracking_change(ldaseq_rev, 0)

In [None]:
tracking_change(ldaseq_rev_fast, 1)

In [None]:
def doc_topic_proportions(corpus, model, times, no_topics, threshold=0.33):
    """Per time period, the share of strong topic assignments held by each topic.

    A document counts towards a topic when that topic's proportion in the
    document is strictly greater than ``threshold``; one document may count
    towards several topics or none.

    Args:
        corpus: documents in time order; ``model[doc]`` must yield the
            document's topic proportions (one value per topic).
        model: object supporting ``model[doc]``.
        times: number of documents in each consecutive time period.
        no_topics: number of topics, i.e. the length of each result vector.
        threshold: minimum (exclusive) proportion for a topic to be counted.

    Returns:
        A dict mapping period index (``0 .. len(times) - 1``) to a NumPy array
        of length ``no_topics``: that period's counts divided by their sum and
        rounded to two decimals. A period with no counts gets all zeros.
        Documents beyond ``sum(times)`` are ignored.
    """
    # boundaries[i] is the index one past the last document of period i.
    boundaries = np.cumsum(times)
    n_periods = len(times)

    time_period_counts = {i: np.zeros(no_topics) for i in range(n_periods)}

    for num, doc in enumerate(corpus):
        # side='right' assigns a document whose index equals a boundary to
        # the following period, so the first document of every period
        # (including document 0) is counted, and empty periods are skipped.
        period = int(np.searchsorted(boundaries, num, side='right'))
        if period >= n_periods:
            break  # remaining documents lie past the last period
        topic_proportions = np.asarray(model[doc])
        for topic in np.nonzero(topic_proportions > threshold)[0]:
            time_period_counts[period][topic] += 1

    time_period_proportions = {}
    for period, counts in time_period_counts.items():
        total = np.sum(counts)
        if total:
            time_period_proportions[period] = np.round(counts / total, 2)
        else:
            # Avoid 0/0 (NaN plus a RuntimeWarning) for periods without counts.
            time_period_proportions[period] = np.zeros(no_topics)

    return time_period_proportions

In [None]:
time_period_proportions = doc_topic_proportions(corpus, ldaseq_fast, times, 5)

In [None]:
time_period_proportions

In [None]:
from scipy.stats import entropy

In [None]:
# Hard-coded topic proportions for five time periods (keys 0-4), five values
# per period. NOTE(review): presumably pasted from an earlier, unrounded run of
# the per-period topic proportions; confirm provenance before relying on them.
proportions  = {0: [0.15789474, 0.23578947, 0.06736842, 0.07368421, 0.46526316],
 1: [0.225     , 0.35416667, 0.1625    , 0.15833333, 0.1       ],
 2: [0.2034384 , 0.33524355, 0.15186246, 0.20630372, 0.10315186],
 3: [0.26519337, 0.32596685, 0.12707182, 0.15469613, 0.12707182],
 4: [0.25096525, 0.36679537, 0.17760618, 0.11583012, 0.08880309]}

In [None]:
entropy(proportions[0], proportions[1])

In [None]:
entropy(proportions[3], proportions[4])

In [None]:
ldaseq_fast.print_topics(time=4)[2]

In [None]:
tracking_change(ldaseq_fast, 2)

## Saving Models and Word Change to CSV

In [None]:
ldaseq_fast = LdaSeqModel.load("ldaseq_fast_filtered_2.5")

In [27]:
# Writes the topics of all time periods into a single file.
def topics_to_csv(model, name_file, times=14):
    """Write every topic of every time period to one CSV file.

    For each period ``0 .. times - 1`` a header row is written, followed by
    one row per topic: the topic number and then that topic's
    ``(word, probability)`` entries, one per cell.

    Args:
        model: object whose ``print_topics(time=...)`` returns a sequence of
            topics, each a sequence of ``(word, probability)`` pairs.
        name_file: path of the CSV file to create (overwritten if it exists).
        times: number of time periods to export.
    """
    with open(name_file, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        for period in range(times):
            writer.writerow(["Topic Number for time period " + str(period), "Word, Probability"])
            for num, topic in enumerate(model.print_topics(time=period)):
                # Build a new row instead of topic.insert(0, num): the lists
                # belong to the model and must not be modified here.
                writer.writerow([num, *topic])
                

In [28]:
topics_to_csv(ldaseq, "ldaseq_16_topics_14_periods_unfiltered.csv")

In [29]:
topics_to_csv(ldaseq_rev, "ldaseq_24_topics_14_periods_unfiltered_reverse.csv")

In [None]:
# Pick the two time periods to compare via the keyword arguments; the
# defaults compare period 0 with period 4.
def change_to_csv(model, name_file, num_of_topics=7, first_time_period=0, second_time_period=4):
    """Write the rank changes of each topic's words to a CSV file.

    After a header row, one row is written per topic ``0 .. num_of_topics - 1``:
    the topic number followed by the ``(word, change)`` entries returned by
    ``tracking_change`` for that topic, one per cell.

    Args:
        model: topic model forwarded to ``tracking_change``.
        name_file: path of the CSV file to create (overwritten if it exists).
        num_of_topics: number of topics to export.
        first_time_period: starting period forwarded to ``tracking_change``.
        second_time_period: end period forwarded to ``tracking_change``.
    """
    with open(name_file, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Topic Number", "Word, Change"])
        for topic_idx in range(num_of_topics):
            topic_changes = tracking_change(
                model,
                topic_idx,
                first_time_period=first_time_period,
                second_time_period=second_time_period,
            )
            writer.writerow([topic_idx, *topic_changes])

In [None]:
# Long-format export: one row per (topic, word) with an English column.
def change_to_csv_advanced(model, name_file, num_of_topics=7, first_time_period=0, second_time_period=4):
    """Write word rank changes to CSV, one row per topic/word pair.

    Columns are "Topic Number", "Deutsch", "Rank" and "English". The "Rank"
    cell holds the change value returned by ``tracking_change``; the "English"
    cell holds the word's entry in the module-level ``ger_to_eng`` lookup, or
    the German word itself when there is no entry.

    Args:
        model: topic model forwarded to ``tracking_change``.
        name_file: path of the CSV file to create (overwritten if it exists).
        num_of_topics: number of topics to export.
        first_time_period: starting period forwarded to ``tracking_change``.
        second_time_period: end period forwarded to ``tracking_change``.
    """
    with open(name_file, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Topic Number", "Deutsch", "Rank", "English"])
        for topic_idx in range(num_of_topics):
            topic_label = "Topic " + str(topic_idx)
            topic_changes = tracking_change(
                model,
                topic_idx,
                first_time_period=first_time_period,
                second_time_period=second_time_period,
            )
            for word, rank in topic_changes:
                # Untranslated words are written as-is in the English column.
                writer.writerow([topic_label, word, rank, ger_to_eng.get(word, word)])

In [None]:
change_to_csv_advanced(ldaseq, "ldaseq_changes_12topics_new.csv", num_of_topics=12)

In [None]:
change_to_csv_advanced(ldaseq_fast, "ldaseq_fast_changes_12topics_all.csv", num_of_topics=5)

In [None]:
change_to_csv(ldaseq_fast, "ldaseq_fast_changes_5topics.csv", num_of_topics=5)

In [None]:
# tracking change in prevalence from period to period