# i_have_a_dream.R
# Load raw data, stored at textuploader.com
speech.raw <- paste(scan(url("http://textuploader.com/1k0g/raw"),
                         what="character"), collapse=" ")
library(wordcloud)
wordcloud(speech.raw) # Also takes other arguments like color
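# A quick variation (a sketch; assumes the RColorBrewer package is installed):
# color the cloud and plot the most frequent words in the center
library(RColorBrewer)
wordcloud(speech.raw, colors=brewer.pal(8, "Dark2"), random.order=FALSE)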
library(qdap)
library(data.table)
# Split into sentences
# qdap's sentSplit is modeled on dialogue data, so a person field is needed
speech.df <- data.table(speech=speech.raw, person="MLK")
sentences <- data.table(sentSplit(speech.df, "speech"))
# Add a sentence counter and remove unnecessary variables
sentences[, sentence.num := seq(nrow(sentences))]
sentences[, person := NULL]
sentences[, tot := NULL]
setcolorder(sentences, c("sentence.num", "speech"))
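# Optional sanity check: the table should now have one row per sentence
head(sentences)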
# Syllables per sentence
sentences[, syllables := syllable.sum(speech)]
# Add cumulative syllable count and percent complete as proxy for progression
sentences[, syllables.cumsum := cumsum(syllables)]
sentences[, pct.complete := syllables.cumsum / sum(sentences$syllables)]
sentences[, pct.complete.100 := pct.complete * 100]
# Sentiment and word count
pol.df <- polarity(sentences$speech)$all
sentences[, words := pol.df$wc]
sentences[, pol := pol.df$polarity]
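# Optional: qdap's polarity() also returns an aggregate summary via $group
polarity(sentences$speech)$group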
# Basic plot of sentiment over time
with(sentences, plot(pct.complete, pol))
# Better plots with ggplot2
library(ggplot2)
library(scales)
my.theme <-
  theme(plot.background = element_blank(),    # Remove background
        panel.grid.major = element_blank(),   # Remove gridlines
        panel.grid.minor = element_blank(),   # Remove more gridlines
        panel.border = element_blank(),       # Remove border
        panel.background = element_blank(),   # Remove more background
        axis.ticks = element_blank(),         # Remove axis ticks
        axis.text = element_text(size=14),    # Enlarge axis text font
        axis.title = element_text(size=16),   # Enlarge axis title font
        plot.title = element_text(size=24, hjust=0))  # Enlarge, left-align title
CustomScatterPlot <- function(gg)
  return(gg + geom_point(color="grey60") +  # Lighten dots
           stat_smooth(color="royalblue", fill="lightgray", size=1.4) +
           xlab("Percent complete (by syllable count)") +
           scale_x_continuous(labels = percent) + my.theme)
CustomScatterPlot(ggplot(sentences, aes(pct.complete, pol)) +
                    ylab("Sentiment (sentence-level polarity)") +
                    ggtitle("Sentiment of I Have a Dream speech"))
# Readability
sentences[, readability :=
            automated_readability_index(speech, sentence.num)$Automated_Readability_Index]
CustomScatterPlot(ggplot(sentences, aes(pct.complete, readability)) +
                    ylab("Automated Readability Index") +
                    ggtitle("Readability of I Have a Dream speech"))
# Adapted from theBioBucket at http://goo.gl/TXvTxP
GoogleHits <- function(query){
  require(XML)
  require(RCurl)
  url <- paste0("https://www.google.com/search?q=", gsub(" ", "+", query))
  CAINFO <- paste0(system.file(package="RCurl"), "/CurlSSL/ca-bundle.crt")
  script <- getURL(url, followlocation=TRUE, cainfo=CAINFO)
  doc <- htmlParse(script)
  # Results look like this:
  # <div class="sd" id="resultStats">About 10,300,000 results</div>
  res <- xpathSApply(doc, '//*/div[@id="resultStats"]', xmlValue)
  return(as.numeric(gsub("[^0-9]", "", res)))
}
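# Example usage (an assumption-laden sketch: requires network access, and
# Google's result markup changes over time, so the XPath above may break):
# GoogleHits("i have a dream")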
# Calculate Google search results for each sentence
# Wrap the full sentence in brackets and append "mlk" to focus the query
# (run per sentence via by=, since GoogleHits parses one results page at a time)
sentences[, google.hits := GoogleHits(paste0("[", gsub("[,;!.]", "", speech),
                                             "] mlk")),
          by=sentence.num]
# Graph Google hits throughout speech
ggplot(sentences, aes(pct.complete, google.hits / 1e6)) +
  geom_line(color="grey40") +  # Lighten the line
  xlab("Percent complete (by syllable count)") +
  scale_x_continuous(labels = percent) + my.theme +
  ylim(0, max(sentences$google.hits) / 1e6) +
  ylab("Sentence memorability (millions of Google hits)") +
  ggtitle("Memorability of I Have a Dream speech")
# Examine top 7 sentences
head(sentences[order(-google.hits)]$speech, 7)
# Plot log of hits to dampen the influence of outliers
sentences[, log.google.hits := log(google.hits)]
CustomScatterPlot(ggplot(sentences, aes(pct.complete, log.google.hits)) +
                    ylab("Memorability (log of sentence's Google hits)") +
                    ggtitle("Memorability of I Have a Dream speech"))
# Linear model with stepwise selection by AIC
library(MASS)
google.lm <- stepAIC(lm(log(google.hits) ~ poly(readability, 3) + pol +
                          pct.complete.100, data=sentences))
# Check out the coefficients and R-squared
summary(google.lm)
# Exponentiate a coefficient to interpret it, since the outcome was logged
exp(google.lm$coefficients["pct.complete.100"])
# Predict memorability across the readability range, holding progression at its
# mean (assumes stepAIC dropped pol; if it was retained, add it to new.data too)
new.data <- data.frame(readability=seq(min(sentences$readability),
                                       max(sentences$readability), by=0.1),
                       pct.complete.100=mean(sentences$pct.complete.100))
new.data$pred.hits <- predict(google.lm, newdata=new.data)
ggplot(new.data, aes(readability, pred.hits)) +
  geom_line(color="royalblue", size=1.4) +
  xlab("Automated Readability Index") +
  ylab("Predicted memorability (log Google hits)") +
  ggtitle("Predicted memorability ~ readability") +
  my.theme
# Exponentiate all coefficients and express them as percent changes
exp.coef <- sprintf("%1.1f%%", (exp(google.lm$coefficients) - 1) * 100)
names(exp.coef) <- names(google.lm$coefficients)
exp.coef
# APPENDIX
# Formality scores are a bit unreliable, e.g. a couple of sentences randomly scored 0
formality <-
  data.table(formality(sentences$speech, sentences$sentence.num)$formality)
sentences[, formality := formality[order(sentence.num)]$formality]
sentences[, formality.smooth := loess(formality ~ syllables.cumsum)$fitted]
# Smooth the other trends via loess as well
sentences[, syllables.smooth := loess(syllables ~ syllables.cumsum)$fitted]
sentences[, pol.smooth := loess(pol ~ syllables.cumsum)$fitted]
sentences[, words.smooth := loess(words ~ syllables.cumsum)$fitted]
sentences[, readability.smooth := loess(readability ~ syllables.cumsum)$fitted]
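# One way to eyeball a smoothed trend (a sketch reusing objects defined above;
# swap pol.smooth for any other *.smooth column):
ggplot(sentences, aes(pct.complete, pol.smooth)) +
  geom_line(color="royalblue", size=1.4) +
  xlab("Percent complete (by syllable count)") +
  ylab("Smoothed sentiment (polarity)") +
  scale_x_continuous(labels = percent) + my.theme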