# Importing the libraries and data

In [None]:
library('quanteda')
library('quanteda.textplots')
library("quanteda.textmodels")
library("topicmodels")
library("tidyverse")

In [None]:
# Restrict the bundled inaugural-address corpus to speeches whose `Year`
# is 1826 or earlier.
inaugural <- data_corpus_inaugural %>%
  corpus_subset(Year <= 1826)

In [None]:
# Build the document-feature matrix step by step:
#   1. tokenise the speeches without punctuation and count tokens per document,
#   2. drop English stopwords,
#   3. trim features using a minimum term frequency of 10.
dfm_inaug <- dfm(tokens(inaugural, remove_punct = TRUE))
dfm_inaug <- dfm_remove(dfm_inaug, stopwords("english"))
dfm_inaug <- dfm_trim(dfm_inaug, min_termfreq = 10)

In [None]:
# Display the document-feature matrix.
dfm_inaug

In [None]:
# Draw a word cloud of the features in the document-feature matrix.
textplot_wordcloud(dfm_inaug)

In [None]:
# Rebuild the matrix from the raw corpus and show it with explicit "count"
# weighting; the commented-out line is the boolean alternative.
# NOTE(review): "count" looks like the default scheme, so the values should
# be the raw frequencies -- confirm against the quanteda docs.
inaugural %>%
  tokens(remove_punct = TRUE) %>%
  dfm() %>%
  dfm_remove(stopwords("english")) %>%
  dfm_trim(min_termfreq = 10) %>%
  dfm_weight(scheme = "count")
  # dfm_weight(scheme = 'boolean')

In [None]:
# Rebuild the matrix from the raw corpus and show it with tf-idf weighting,
# using proportional term frequency (`scheme_tf = "prop"`). The commented-out
# lines are the other weightings tried in this cell.
inaugural %>%
  tokens(remove_punct = TRUE) %>%
  dfm() %>%
  dfm_remove(stopwords("english")) %>%
  dfm_trim(min_termfreq = 10) %>%
  # dfm_weight(scheme = 'count')
  # dfm_weight(scheme = 'boolean')
  # dfm_tfidf()
  dfm_tfidf(scheme_tf = "prop")

In [None]:
# Rebuild `dfm_inaug` for the models below, this time with stemming:
# tokenise without punctuation, drop English stopwords, stem the features,
# trim with a minimum term frequency of 10, and weight by "count".
dfm_inaug <- dfm(tokens(inaugural, remove_punct = TRUE))
dfm_inaug <- dfm_remove(dfm_inaug, stopwords("english"))
dfm_inaug <- dfm_wordstem(dfm_inaug)
dfm_inaug <- dfm_trim(dfm_inaug, min_termfreq = 10)
dfm_inaug <- dfm_weight(dfm_inaug, scheme = "count")

# LSA

In [None]:
# Fit a latent semantic analysis model on the document-feature matrix,
# requesting 5 dimensions (`nd`).
inaug_lsa <- textmodel_lsa(dfm_inaug, nd=5)

In [None]:
# Show the `docs` component of the LSA fit (one row per document).
inaug_lsa$docs

In [None]:
# Show the `features` component of the LSA fit (one row per feature).
inaug_lsa$features

In [None]:
# Preview the document matrix as a data frame; the columns are referenced
# as V1..V5 in the reshaping step that follows.
head(as.data.frame(inaug_lsa$docs))

In [None]:
# Reshape the LSA document matrix to long format: one row per
# (speech, topic) pair, with the loading in `value`.
inaug_lsa_longer <- inaug_lsa$docs %>%
  as.data.frame() %>%
  rownames_to_column(var = "speech") %>%
  pivot_longer(cols = V1:V5, names_to = "topics")

In [None]:
# Display the long-format document-topic table.
inaug_lsa_longer

In [None]:
# Stacked bars of each speech's LSA loadings, coloured by topic,
# drawn on a square panel.
ggplot(
  inaug_lsa_longer,
  aes(x = speech, y = value, colour = topics, fill = topics)
) +
  geom_bar(stat = "identity") +
  theme(aspect.ratio = 1)

This reveals one of the drawbacks of LSA: the document-topic matrix can contain negative values, which makes the topic loadings hard to interpret.

In [None]:
# Project the first document of `dfm_inaug` through the fitted LSA model
# and show its coordinates in the reduced space (`docs_newspace`).
predict(inaug_lsa, dfm_inaug[1,])$docs_newspace

In [None]:
# Show the `sk` component of the LSA fit.
# NOTE(review): presumably the singular values of the decomposition -- confirm.
inaug_lsa$sk

In [None]:
# Preview the first rows of the feature matrix.
head(inaug_lsa$features)

In [None]:
# Reshape the LSA feature matrix to long format: one row per
# (word, topic) pair, with the loading in `value`; then display it.
word_topic_vals <- inaug_lsa$features %>%
  as.data.frame() %>%
  rownames_to_column(var = "words") %>%
  pivot_longer(cols = V1:V5, names_to = "topics")
word_topic_vals

In [None]:
# Keep the five highest-valued words per topic: sort all rows by descending
# value, then take the first five rows within each topic group.
# The result stays grouped by `topics`.
word_topic_vals_top5 <- word_topic_vals %>%
  arrange(desc(value)) %>%
  group_by(topics) %>%
  slice_head(n = 5)
word_topic_vals_top5

In [None]:
# Show the top-5 table as a plain data frame.
as.data.frame(word_topic_vals_top5)

In [None]:
# Dot plot of the top words per topic, one facet per topic.
# Each row gets its own position on the (flipped) x axis, counting down from
# the number of rows, and that position is labelled with the row's word, so
# the first row of each topic is drawn at the top of its facet.
ggplot(
  data = word_topic_vals_top5,
  aes(x = nrow(word_topic_vals_top5):1, y = value)
) +
  geom_point() +
  scale_x_continuous(
    breaks = nrow(word_topic_vals_top5):1,
    labels = word_topic_vals_top5$words
  ) +
  coord_flip() +
  facet_wrap(~ topics, scales = "free")

# LDA

In [None]:
# Rebuild the stemmed, count-weighted document-feature matrix used as
# input to LDA: tokenise without punctuation, drop English stopwords, stem,
# trim with a minimum term frequency of 10, and weight by "count".
dfm_inaug <- dfm(tokens(inaugural, remove_punct = TRUE))
dfm_inaug <- dfm_remove(dfm_inaug, stopwords("english"))
dfm_inaug <- dfm_wordstem(dfm_inaug)
dfm_inaug <- dfm_trim(dfm_inaug, min_termfreq = 10)
dfm_inaug <- dfm_weight(dfm_inaug, scheme = "count")

In [None]:
# Fit a 5-topic LDA model on the document-feature matrix, converted to the
# format expected by the topicmodels package.
# LDA starts from a random initialisation, so without a seed the topic
# numbering and top terms change on every run; fixing the seed keeps the
# tables and plots below reproducible.
lda5 <- convert(dfm_inaug, to = "topicmodels") %>%
  LDA(k = 5, control = list(seed = 1234))

In [None]:
# List 10 terms for each of the fitted topics.
terms(lda5, 10)

In [None]:
# Rebuild the stemmed document-feature matrix, this time with boolean
# weighting instead of counts.
# NOTE(review): the boolean matrix is fed to LDA further down; LDA is usually
# fitted on counts, so confirm this comparison is intentional.
dfm_inaug <- dfm(tokens(inaugural, remove_punct = TRUE))
dfm_inaug <- dfm_remove(dfm_inaug, stopwords("english"))
dfm_inaug <- dfm_wordstem(dfm_inaug)
dfm_inaug <- dfm_trim(dfm_inaug, min_termfreq = 10)
dfm_inaug <- dfm_weight(dfm_inaug, scheme = "boolean")

In [None]:
# Display the boolean-weighted document-feature matrix.
dfm_inaug

In [None]:
# Refit the 5-topic LDA model on the current (boolean-weighted) matrix,
# converted to the format expected by the topicmodels package.
# LDA starts from a random initialisation, so without a seed the topic
# numbering and top terms change on every run; fixing the seed keeps the
# tables and plots below reproducible.
lda5 <- convert(dfm_inaug, to = "topicmodels") %>%
  LDA(k = 5, control = list(seed = 1234))

In [None]:
# List 10 terms for each of the fitted topics.
terms(lda5, 10)

In [None]:
# Show the topic assigned to each document.
# NOTE(review): presumably the single most likely topic per document -- confirm.
topics(lda5)

In [None]:
# Show the raw `beta` slot of the fitted model (topics in rows, terms in columns).
# NOTE(review): topicmodels appears to store these on the log scale -- confirm.
lda5@beta

In [None]:
# Transpose `beta` so each row is a term (labelled from `lda5@terms`) and
# each column is a topic.
as.data.frame(t(lda5@beta), row.names = lda5@terms)

In [None]:
# Show the `gamma` slot as a data frame with one row per document
# (labelled from `lda5@documents`) and one column per topic.
as.data.frame(lda5@gamma, row.names = lda5@documents)

In [None]:
# Reshape the document-topic matrix (`gamma`) to long format: one row per
# (speech, topic) pair, with the weight in `value`.
doc_topic_longer <- lda5@gamma %>%
  as.data.frame(row.names = lda5@documents) %>%
  rownames_to_column(var = "speech") %>%
  pivot_longer(cols = V1:V5, names_to = "topic")

In [None]:
# Display the long-format document-topic table.
doc_topic_longer

In [None]:
# Horizontal stacked bars of each speech's topic weights, coloured by topic,
# drawn on a square panel.
ggplot(
  doc_topic_longer,
  aes(x = value, y = speech, colour = topic, fill = topic)
) +
  geom_bar(stat = "identity") +
  theme(aspect.ratio = 1)

In [None]:
# Term-by-topic table: transpose `beta` so terms are rows (labelled from
# `lda5@terms`) and topics are columns.
lda5_word_topic_vals <- lda5@beta %>%
  t() %>%
  as.data.frame(row.names = lda5@terms)

In [None]:
# Long-format (word, topic, value) table for the LDA model, then display it.
# This reuses (and overwrites) the `word_topic_vals` name from the LSA section.
word_topic_vals <- pivot_longer(
  rownames_to_column(lda5_word_topic_vals, var = "words"),
  cols = V1:V5,
  names_to = "topics"
)
word_topic_vals

In [None]:
# Keep the fifteen highest-valued words per topic: sort all rows by
# descending value, then take the first fifteen rows within each topic group.
# The result stays grouped by `topics`.
word_topic_vals_top15 <- word_topic_vals %>%
  arrange(desc(value)) %>%
  group_by(topics) %>%
  slice_head(n = 15)
word_topic_vals_top15

In [None]:
# Dot plot of the top words per topic, one facet per topic.
# Each row gets its own position on the (flipped) x axis, counting down from
# the number of rows, and that position is labelled with the row's word, so
# the first row of each topic is drawn at the top of its facet.
ggplot(
  data = word_topic_vals_top15,
  aes(x = nrow(word_topic_vals_top15):1, y = value)
) +
  geom_point() +
  scale_x_continuous(
    breaks = nrow(word_topic_vals_top15):1,
    labels = word_topic_vals_top15$words
  ) +
  coord_flip() +
  facet_wrap(~ topics, scales = "free")