In [90]:
library(ggplot2)
library(plyr)
library(dplyr)
library(tidyr)
library(lme4)
library(lmerTest)
library(stringr)


Attaching package: 'lmerTest'


The following object is masked from 'package:lme4':

    lmer


The following object is masked from 'package:stats':

    step




In [76]:
# Load in SPR and surprisal data for a subset

spr <- read.csv("./Fillers.csv")
spr$Sentence <- str_replace_all(spr$Sentence, "%2C", ",")
spr <- spr %>% filter(RT<=7000) 

surps <- read.csv("../data/gulordava/items_filler.lstm.csv")
surps[surps$surprisal == -1,]$surprisal <- NA # recode NA surprisals as real NAs
surps$word_pos = surps$word_pos + 1# adjust to 1-indexing


# Load in frequencies from the Gulordava Wikipedia corpus
freqs <- read.csv("./freqs.csv")


In [78]:
# merge the two dfs such that we have the relevant surprisal and frequency with each rt

merged.freqs <- merge(x=spr, y=freqs, by.x="EachWord", by.y="word", all.x=TRUE)
merged.surps <- merge(x=merged.freqs, y=surps, 
                      by.x=c("Sentence", "WordPosition"), by.y=c("Sentence", "word_pos"), all.x=TRUE)

In [80]:
# Store properties of past words in each row (going back 3 words)

merged.with_lags <- merged.surps %>% group_by_at(vars(Sentence, MD5)) %>%
                    mutate(RT_p1 = lag(RT), 
                           RT_p2 = lag(RT_p1), 
                           RT_p3 = lag(RT_p2),
                           length = nchar(EachWord),
                           length_p1 = lag(length), 
                           length_p2 = lag(length_p1),
                           length_p3 = lag(length_p2),
                           logfreq = log(count),
                           logfreq_p1 = lag(logfreq), 
                           logfreq_p2 = lag(logfreq_p1),
                           logfreq_p3 = lag(logfreq_p2),
                           surprisal_p1 = lag(surprisal),
                           surprisal_p2 = lag(surprisal_p1),
                           surprisal_p3 = lag(surprisal_p2)
                  )

In [88]:
# drop rows with missing data (surprisals for past 3 words and freqs for past 3 words)

merged.drop <- subset(merged.with_lags, !is.na(surprisal) & !is.na(surprisal_p1) & 
                                        !is.na(surprisal_p2) & !is.na(surprisal_p3) &
                                        !is.na(logfreq) & !is.na(logfreq_p1) &
                                        !is.na(logfreq_p2) & !is.na(logfreq_p3))

# print number of remaining rows
print(nrow(merged.with_lags))
print(nrow(merged.drop))


[1] 1403522
[1] 435894


Sentence,WordPosition,EachWord,Time,MD5,Type,EventTime,Question.x,Answer,List,...,length_p1,length_p2,length_p3,logfreq,logfreq_p1,logfreq_p2,logfreq_p3,surprisal_p1,surprisal_p2,surprisal_p3
<chr>,<int>,<chr>,<int>,<chr>,<chr>,<dbl>,<chr>,<int>,<chr>,...,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
"A bill was drafted and introduced into Parliament several times but met with great opposition, mostly from farmers.",5,and,1636487256,b18be91fab6e92d9d875b7213d85209d,FILLER1,1.64000e+12,Who hates the bill most?,0,r,...,7,3,4,14.5461,7.189168,13.62667,9.212737,4.788502,3.049126,9.728388
"A bill was drafted and introduced into Parliament several times but met with great opposition, mostly from farmers.",5,and,1634670345,f77b5899bb42378ae26a8c62fd778f72,FILLER1,1.63467e+12,Who hates the bill most?,0,e,...,7,3,4,14.5461,7.189168,13.62667,9.212737,4.788502,3.049126,9.728388
"A bill was drafted and introduced into Parliament several times but met with great opposition, mostly from farmers.",5,and,1630697959,c64b82c52eb7e645a8824134c223f79f,FILLER1,1.63000e+12,Who hates the bill most?,0,c,...,7,3,4,14.5461,7.189168,13.62667,9.212737,4.788502,3.049126,9.728388
"A bill was drafted and introduced into Parliament several times but met with great opposition, mostly from farmers.",5,and,1636134055,2448f7b9f1ce5b9de3810508e91b8d2c,FILLER1,1.64000e+12,Who hates the bill most?,0,o,...,7,3,4,14.5461,7.189168,13.62667,9.212737,4.788502,3.049126,9.728388
"A bill was drafted and introduced into Parliament several times but met with great opposition, mostly from farmers.",5,and,1639594360,af8a7829842816888d78c2b31756127b,FILLER1,1.63959e+12,Who hates the bill most?,0,c,...,7,3,4,14.5461,7.189168,13.62667,9.212737,4.788502,3.049126,9.728388
"A bill was drafted and introduced into Parliament several times but met with great opposition, mostly from farmers.",5,and,1627335989,2db95cd08dc86e0140289cea47fddf9d,FILLER1,1.63000e+12,Who hates the bill most?,0,h,...,7,3,4,14.5461,7.189168,13.62667,9.212737,4.788502,3.049126,9.728388
"A bill was drafted and introduced into Parliament several times but met with great opposition, mostly from farmers.",5,and,1635790232,c4259766948a9bbcb471938830c3ba35,FILLER1,1.63579e+12,Who hates the bill most?,0,p,...,7,3,4,14.5461,7.189168,13.62667,9.212737,4.788502,3.049126,9.728388
"A bill was drafted and introduced into Parliament several times but met with great opposition, mostly from farmers.",5,and,1639428323,4901b5398231a6f322d5ef1448a5ef34,FILLER1,1.63943e+12,Who hates the bill most?,0,l,...,7,3,4,14.5461,7.189168,13.62667,9.212737,4.788502,3.049126,9.728388
"A bill was drafted and introduced into Parliament several times but met with great opposition, mostly from farmers.",5,and,1639083236,5c7412c111666bacff5dbd66075024eb,FILLER1,1.63908e+12,Who hates the bill most?,0,n,...,7,3,4,14.5461,7.189168,13.62667,9.212737,4.788502,3.049126,9.728388
"A bill was drafted and introduced into Parliament several times but met with great opposition, mostly from farmers.",5,and,1639593201,4d4c228670ade561525af34b4b00bd6f,FILLER1,1.63959e+12,Who hates the bill most?,0,d,...,7,3,4,14.5461,7.189168,13.62667,9.212737,4.788502,3.049126,9.728388


In [99]:
models.filler <- lmer(data=merged.drop,
                      RT ~ surprisal + surprisal_p1 + surprisal_p2 + surprisal_p3 +
                           WordPosition + logfreq*length + logfreq_p1*length_p1 + 
                           logfreq_p2*length_p2 + logfreq_p3*length_p3 + (1 | MD5))
summary(models.filler) 

saveRDS(models.filler, "filler_lm.rds")


Correlation matrix not shown by default, as p = 18 > 12.
Use print(obj, correlation=TRUE)  or
    vcov(obj)        if you need it




Linear mixed model fit by REML. t-tests use Satterthwaite's method [
lmerModLmerTest]
Formula: RT ~ surprisal + surprisal_p1 + surprisal_p2 + surprisal_p3 +  
    WordPosition + logfreq * length + logfreq_p1 * length_p1 +  
    logfreq_p2 * length_p2 + logfreq_p3 * length_p3 + (1 | MD5)
   Data: merged.drop

REML criterion at convergence: 5674196

Scaled residuals: 
   Min     1Q Median     3Q    Max 
-5.231 -0.411 -0.129  0.196 40.341 

Random effects:
 Groups   Name        Variance Std.Dev.
 MD5      (Intercept)  9435     97.13  
 Residual             25830    160.72  
Number of obs: 435894, groups:  MD5, 2000

Fixed effects:
                       Estimate Std. Error         df t value Pr(>|t|)    
(Intercept)           2.279e+02  7.690e+00  1.919e+05  29.633  < 2e-16 ***
surprisal             2.732e+00  1.104e-01  4.339e+05  24.758  < 2e-16 ***
surprisal_p1          3.356e+00  1.127e-01  4.339e+05  29.771  < 2e-16 ***
surprisal_p2          2.001e+00  1.098e-01  4.339e+05  18.223  <

In [94]:
# Now that we've fit a model on the fillers, load in and predict on criticals

agree.spr <- read.csv("./AgreementSet.csv")
agree.spr$Sentence <- str_replace_all(agree.spr$Sentence, "%2C", ",")
agree.spr <- agree.spr %>% filter(RT<=7000) 

agree.surps <- read.csv("../data/gulordava/items_agreement.lstm.csv")
agree.surps[agree.surps$surprisal == -1,]$surprisal <- NA # recode NA surprisals as real NAs
agree.surps$word_pos = agree.surps$word_pos + 1# adjust to 1-indexing


In [95]:
agree.freqs <- merge(x=agree.spr, y=freqs, by.x="EachWord", by.y="word", all.x=TRUE)
agree.surps <- merge(x=agree.freqs, y=agree.surps, 
                      by.x=c("Sentence", "WordPosition"), by.y=c("Sentence", "word_pos"), all.x=TRUE)


In [96]:
agree.with_lags <-  agree.surps %>% group_by_at(vars(Sentence, MD5)) %>%
                    mutate(RT_p1 = lag(RT), 
                           RT_p2 = lag(RT_p1), 
                           RT_p3 = lag(RT_p2),
                           length = nchar(EachWord),
                           length_p1 = lag(length), 
                           length_p2 = lag(length_p1),
                           length_p3 = lag(length_p2),
                           logfreq = log(count),
                           logfreq_p1 = lag(logfreq), 
                           logfreq_p2 = lag(logfreq_p1),
                           logfreq_p3 = lag(logfreq_p2),
                           surprisal_p1 = lag(surprisal),
                           surprisal_p2 = lag(surprisal_p1),
                           surprisal_p3 = lag(surprisal_p2)
                  )

In [98]:
agree.drop <- subset(agree.with_lags, !is.na(surprisal) & !is.na(surprisal_p1) & 
                                        !is.na(surprisal_p2) & !is.na(surprisal_p3) &
                                        !is.na(logfreq) & !is.na(logfreq_p1) &
                                        !is.na(logfreq_p2) & !is.na(logfreq_p3))

# print number of remaining rows
print(nrow(agree.with_lags))
print(nrow(agree.drop))

[1] 420676
[1] 76829


In [100]:
agree.drop$predicted <- predict(models.filler, newdata=agree.drop)

In [101]:
head(agree.drop)

Sentence,WordPosition,EachWord,Time,MD5,Type,EventTime,Question.x,Answer,List,...,length_p2,length_p3,logfreq,logfreq_p1,logfreq_p2,logfreq_p3,surprisal_p1,surprisal_p2,surprisal_p3,predicted
<chr>,<int>,<chr>,<int>,<chr>,<chr>,<dbl>,<chr>,<int>,<chr>,...,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
"After the diplomat signs, the agreement creates another border conflict as a side effect.",8,another,1639090449,23bb47ffc97720f565f97e9644e95124,AGREE,1639090000000.0,Does the agreement lead to peace?,0,f,...,9,3,10.4001,7.354362,8.817594,15.46925,8.870157,8.932127,1.84974,564.4141
"After the diplomat signs, the agreement creates another border conflict as a side effect.",8,another,1635868863,d29c1f4fdda6be958fa6edad3a6241b1,AGREE,1640000000000.0,Does the agreement lead to peace?,0,f,...,9,3,10.4001,7.354362,8.817594,15.46925,8.870157,8.932127,1.84974,406.8969
"After the diplomat signs, the agreement creates another border conflict as a side effect.",8,another,1630705356,e0577fbd716c919f278f03cd8528d783,AGREE,1630000000000.0,Does the agreement lead to peace?,0,f,...,9,3,10.4001,7.354362,8.817594,15.46925,8.870157,8.932127,1.84974,372.2721
"After the diplomat signs, the agreement creates another border conflict as a side effect.",8,another,1639089322,1747d390d6731a7cac89f160cbecd085,AGREE,1639090000000.0,Does the agreement lead to peace?,0,n,...,9,3,10.4001,7.354362,8.817594,15.46925,8.870157,8.932127,1.84974,367.1755
"After the diplomat signs, the agreement creates another border conflict as a side effect.",8,another,1635189532,1f3195181d005445cbb2a00051dde2d2,AGREE,1635190000000.0,Does the agreement lead to peace?,0,f,...,9,3,10.4001,7.354362,8.817594,15.46925,8.870157,8.932127,1.84974,337.1942
"After the diplomat signs, the agreement creates another border conflict as a side effect.",8,another,1636482344,af5d178c58828a26f8823f76b82e343e,AGREE,1640000000000.0,Does the agreement lead to peace?,0,p,...,9,3,10.4001,7.354362,8.817594,15.46925,8.870157,8.932127,1.84974,420.6281
