In [1]:
library(ggplot2)
library(plyr)
library(dplyr)
library(tidyr)
library(lme4)
library(lmerTest)
library(stringr)


Attaching package: 'dplyr'


The following objects are masked from 'package:plyr':

    arrange, count, desc, failwith, id, mutate, rename, summarise,
    summarize


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union


Loading required package: Matrix


Attaching package: 'Matrix'


The following objects are masked from 'package:tidyr':

    expand, pack, unpack



Attaching package: 'lmerTest'


The following object is masked from 'package:lme4':

    lmer


The following object is masked from 'package:stats':

    step




In [2]:
# Load in SPR and surprisal data for a subset

# Reading times for the filler sentences. "%2C" in Sentence is an encoded
# comma and is decoded here; rows with RT above 7000 are discarded and the
# MD5 column is exposed under the name `participant`.
spr <- read.csv("./Fillers.csv") %>%
  mutate(Sentence = str_replace_all(Sentence, "%2C", ",")) %>%
  filter(RT <= 7000) %>%
  rename(participant = MD5)

# Per-word surprisals for the fillers from each language model. The files'
# word positions are shifted by one so that they are 1-indexed.
# Disabled sentinel handling, kept for reference:
#   surps[surps$mean_surprisal == -1,]$mean_surprisal <- NA
#   surps[surps$sum_surprisal == -1,]$sum_surprisal <- NA  (recode NA surprisals as real NAs)
surps_lstm <- read.csv("../data/gulordava/items_filler.lstm.csv")
surps_lstm$word_pos <- surps_lstm$word_pos + 1

surps_gpt2 <- read.csv("../data/gpt2/items_filler.gpt2.csv")
surps_gpt2$word_pos <- surps_gpt2$word_pos + 1

# Word frequencies from the Gulordava Wikipedia corpus
freqs <- read.csv("./freqs.csv")


In [3]:
# Attach a frequency and both models' surprisals to every reading time.
# All merges are left joins (all.x = TRUE), so SPR rows without a match are
# kept and simply carry NA in the joined columns.

# Frequencies are looked up by the lower-cased word form.
spr$word <- tolower(spr$EachWord)
filler.freqs <- merge(spr, freqs, by = "word", all.x = TRUE)

# LSTM surprisal, matched on item and (1-indexed) word position.
filler.surps <- merge(
  filler.freqs, surps_lstm,
  by.x = c("item", "WordPosition"),
  by.y = c("item.here", "word_pos"),
  all.x = TRUE
)
# change to avg if that's more appropriate
filler.surps[["surprisal_lstm"]] <- filler.surps$sum_surprisal

# GPT-2 surprisal, matched on the same keys.
filler.surps <- merge(
  filler.surps, surps_gpt2,
  by.x = c("item", "WordPosition"),
  by.y = c("item.here", "word_pos"),
  all.x = TRUE
)
# change to avg if that's more appropriate
filler.surps[["surprisal_gpt2"]] <- filler.surps$sumsurprisal


In [4]:
# Store properties of past words in each row (going back 3 words).
#
# For every (item, participant) reading, each row receives the RT, word
# length, log frequency and both surprisals of the 1st, 2nd and 3rd preceding
# rows (suffixes _p1, _p2, _p3). The first rows of a reading get NA lags.
#
# Rows are sorted by WordPosition inside each group before lagging, so the
# lags do not depend on whatever row order the earlier merge() calls left
# behind. The result is ungrouped so later steps do not silently run per group.
#
# NOTE(review): lag() looks at the preceding *row*. Rows with RT > 7000 were
# removed when the SPR data was loaded, so after such a gap "_p1" refers to
# the nearest remaining earlier word, not necessarily WordPosition - 1.
filler.with_lags <- filler.surps %>%
  group_by(item, participant) %>%
  arrange(WordPosition, .by_group = TRUE) %>%
  mutate(
    RT_p1 = lag(RT),
    RT_p2 = lag(RT_p1),
    RT_p3 = lag(RT_p2),
    length = nchar(EachWord),
    length_p1 = lag(length),
    length_p2 = lag(length_p1),
    length_p3 = lag(length_p2),
    logfreq = log(count),
    logfreq_p1 = lag(logfreq),
    logfreq_p2 = lag(logfreq_p1),
    logfreq_p3 = lag(logfreq_p2),
    surprisal_lstm_p1 = lag(surprisal_lstm),
    surprisal_lstm_p2 = lag(surprisal_lstm_p1),
    surprisal_lstm_p3 = lag(surprisal_lstm_p2),
    surprisal_gpt2_p1 = lag(surprisal_gpt2),
    surprisal_gpt2_p2 = lag(surprisal_gpt2_p1),
    surprisal_gpt2_p3 = lag(surprisal_gpt2_p2)
  ) %>%
  ungroup()

In [5]:
# Rows that have a GPT-2 surprisal but neither an LSTM surprisal nor a
# frequency count; show their (all-NA) count column.
x <- subset(
  filler.surps,
  is.na(surprisal_lstm) & !is.na(surprisal_gpt2) & is.na(count)
)

x[["count"]]

In [6]:
# Drop rows with missing data: a row is kept only if the current word and
# each of the 3 preceding words have both a surprisal and a log frequency.

filler.drop.lstm <- subset(
  filler.with_lags,
  complete.cases(surprisal_lstm, surprisal_lstm_p1,
                 surprisal_lstm_p2, surprisal_lstm_p3,
                 logfreq, logfreq_p1, logfreq_p2, logfreq_p3)
)

filler.drop.gpt2 <- subset(
  filler.with_lags,
  complete.cases(surprisal_gpt2, surprisal_gpt2_p1,
                 surprisal_gpt2_p2, surprisal_gpt2_p3,
                 logfreq, logfreq_p1, logfreq_p2, logfreq_p3)
)

# Row counts: before dropping, then after dropping for each model.
invisible(lapply(
  list(filler.with_lags, filler.drop.lstm, filler.drop.gpt2),
  function(d) print(nrow(d))
))

# Distinct filler items: overall, and those surviving for each model.
all_fillers <- levels(as.factor(filler.with_lags$item))
lstm_fillers <- levels(as.factor(filler.drop.lstm$item))
gpt2_fillers <- levels(as.factor(filler.drop.gpt2$item))
print(length(all_fillers))
print(length(lstm_fillers))
print(length(gpt2_fillers))

# Items that disappear entirely from the LSTM data set.
diff <- setdiff(all_fillers, lstm_fillers)
print(diff)

# Rows of those lost items that fail the LSTM completeness criterion ...
filler.dropped <- subset(
  filler.with_lags,
  !complete.cases(surprisal_lstm, surprisal_lstm_p1,
                  surprisal_lstm_p2, surprisal_lstm_p3,
                  logfreq, logfreq_p1, logfreq_p2, logfreq_p3) &
    (item %in% diff)
)

# ... and, among them, the rows that would nevertheless pass the GPT-2
# criterion.
filler.dropped_diff <- subset(
  filler.dropped,
  complete.cases(surprisal_gpt2, surprisal_gpt2_p1,
                 surprisal_gpt2_p2, surprisal_gpt2_p3,
                 logfreq, logfreq_p1, logfreq_p2, logfreq_p3)
)

filler.dropped_diff


[1] 1403522
[1] 573776
[1] 573776
[1] 40
[1] 30
[1] 30
 [1] "103" "104" "105" "106" "107" "108" "109" "110" "111" "112"


"number of rows of result is not a multiple of vector length (arg 2)"
"number of rows of result is not a multiple of vector length (arg 2)"
"number of rows of result is not a multiple of vector length (arg 2)"
"number of rows of result is not a multiple of vector length (arg 2)"


item,WordPosition,word.x,Time,participant,Type,EachWord,EventTime,Sentence.x,Question.x,...,logfreq,logfreq_p1,logfreq_p2,logfreq_p3,surprisal_lstm_p1,surprisal_lstm_p2,surprisal_lstm_p3,surprisal_gpt2_p1,surprisal_gpt2_p2,surprisal_gpt2_p3
<int>,<int>,<chr>,<int>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,...,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>


In [7]:
# Mixed-effects model of filler reading times using the LSTM surprisals.
# Fixed effects: surprisal of the current and 3 preceding words, word
# position, and log-frequency x length (with interaction) for the current and
# 3 preceding words. Random intercepts for participant and item.
models.filler.lstm <- lmer(
  RT ~ surprisal_lstm + surprisal_lstm_p1 + surprisal_lstm_p2 + surprisal_lstm_p3 +
    WordPosition +
    logfreq * length + logfreq_p1 * length_p1 +
    logfreq_p2 * length_p2 + logfreq_p3 * length_p3 +
    (1 | participant) + (1 | item),
  data = filler.drop.lstm
)
summary(models.filler.lstm)

# Persist the fitted model.
saveRDS(models.filler.lstm, file = "filler_lstm_sum.rds")


Correlation matrix not shown by default, as p = 18 > 12.
Use print(obj, correlation=TRUE)  or
    vcov(obj)        if you need it




Linear mixed model fit by REML. t-tests use Satterthwaite's method [
lmerModLmerTest]
Formula: RT ~ surprisal_lstm + surprisal_lstm_p1 + surprisal_lstm_p2 +  
    surprisal_lstm_p3 + WordPosition + logfreq * length + logfreq_p1 *  
    length_p1 + logfreq_p2 * length_p2 + logfreq_p3 * length_p3 +  
    (1 | participant) + (1 | item)
   Data: filler.drop.lstm

REML criterion at convergence: 7519392

Scaled residuals: 
   Min     1Q Median     3Q    Max 
-5.048 -0.409 -0.130  0.193 38.473 

Random effects:
 Groups      Name        Variance Std.Dev.
 participant (Intercept)  9841.1   99.20  
 item        (Intercept)   223.4   14.95  
 Residual                28306.4  168.25  
Number of obs: 573776, groups:  participant, 2000; item, 30

Fixed effects:
                       Estimate Std. Error         df t value Pr(>|t|)    
(Intercept)           2.708e+02  7.753e+00  1.730e+03  34.925  < 2e-16 ***
surprisal_lstm        2.407e+00  1.071e-01  5.321e+05  22.469  < 2e-16 ***
surprisal_lstm_p1

In [8]:
# Mixed-effects model of filler reading times using the GPT-2 surprisals.
# Fixed effects: surprisal of the current and 3 preceding words, word
# position, and log-frequency x length (with interaction) for the current and
# 3 preceding words. Random intercepts for participant and item.
models.filler.gpt2 <- lmer(
  RT ~ surprisal_gpt2 + surprisal_gpt2_p1 + surprisal_gpt2_p2 + surprisal_gpt2_p3 +
    WordPosition +
    logfreq * length + logfreq_p1 * length_p1 +
    logfreq_p2 * length_p2 + logfreq_p3 * length_p3 +
    (1 | participant) + (1 | item),
  data = filler.drop.gpt2
)
summary(models.filler.gpt2)

# Persist the fitted model.
saveRDS(models.filler.gpt2, file = "filler_gpt2_sum.rds")


Correlation matrix not shown by default, as p = 18 > 12.
Use print(obj, correlation=TRUE)  or
    vcov(obj)        if you need it




Linear mixed model fit by REML. t-tests use Satterthwaite's method [
lmerModLmerTest]
Formula: RT ~ surprisal_gpt2 + surprisal_gpt2_p1 + surprisal_gpt2_p2 +  
    surprisal_gpt2_p3 + WordPosition + logfreq * length + logfreq_p1 *  
    length_p1 + logfreq_p2 * length_p2 + logfreq_p3 * length_p3 +  
    (1 | participant) + (1 | item)
   Data: filler.drop.gpt2

REML criterion at convergence: 7519322

Scaled residuals: 
   Min     1Q Median     3Q    Max 
-5.060 -0.409 -0.130  0.192 38.493 

Random effects:
 Groups      Name        Variance Std.Dev.
 participant (Intercept)  9841.2   99.20  
 item        (Intercept)   129.6   11.39  
 Residual                28303.7  168.24  
Number of obs: 573776, groups:  participant, 2000; item, 30

Fixed effects:
                       Estimate Std. Error         df t value Pr(>|t|)    
(Intercept)           3.024e+02  6.968e+00  3.131e+03  43.399  < 2e-16 ***
surprisal_gpt2        1.852e+00  9.924e-02  4.432e+05  18.657  < 2e-16 ***
surprisal_gpt2_p1

In [9]:
# Now that we've fit a model on the fillers, load in and predict on criticals

# Reading times for the critical items:
#  * decode "%2C" back to commas in Sentence and EachWord,
#  * word.clean is EachWord with commas stripped (used for frequency lookup),
#  * discard rows with RT above 7000 and rename MD5 to participant,
#  * split Type on "_" into Type and pGram; a bare "AGREE" is first rewritten
#    as "AGREE_G" so that it splits into two parts as well,
#  * collapse the pGram labels to G / U ("UAMB" -> G; "AMB", "UNG" -> U),
#  * store pGram and Type as factors.
agree.spr <- read.csv("./AgreementSet.csv") %>%
  mutate(
    Sentence = str_replace_all(Sentence, "%2C", ","),
    EachWord = str_replace_all(EachWord, "%2C", ","),
    word.clean = str_replace_all(EachWord, ",", "")
  ) %>%
  filter(RT <= 7000) %>%
  rename(participant = MD5) %>%
  mutate(Type = replace(Type, which(Type == "AGREE"), "AGREE_G")) %>%
  separate(Type, c("Type", "pGram"), sep = "_") %>%
  mutate(
    pGram = as.factor(recode(pGram, "UAMB" = "G", "AMB" = "U", "UNG" = "U")),
    Type = as.factor(Type)
  )

# LSTM surprisals for the critical items, assembled from two files.

# Agreement file: every row is Type "AGREE"; pGram is derived from Condition
# and the item id is taken from Corresponding.NP.Z.item.
agree.surps.lstm_a <- read.csv("../data/gulordava/items_agreement.lstm.csv") %>%
  mutate(
    Type = "AGREE",
    pGram = recode(Condition, "UNAGREE" = "U", "AGREE" = "G"),
    item = Corresponding.NP.Z.item
  )

# Main file: Type comes from the first "_"-separated part of `condition`;
# the pGram part produced by separate() is then overwritten from `ambiguity`.
agree.surps.lstm_m <- read.csv("../data/gulordava/items_main.lstm.csv") %>%
  separate(condition, c("Type", "pGram"), sep = "_") %>%
  mutate(pGram = recode(ambiguity, "ambiguous" = "U", "unambiguous" = "G"))

# Stack the two tables, restricted to the columns they have in common.
com_cols <- intersect(colnames(agree.surps.lstm_a), colnames(agree.surps.lstm_m))
agree.surps.lstm <- rbind(
  agree.surps.lstm_a[, com_cols],
  agree.surps.lstm_m[, com_cols]
)


# GPT-2 surprisals for the critical items, assembled from two files.

# Agreement file: every row is Type "AGREE"; pGram is derived from Condition
# and the item id is taken from Corresponding.NP.Z.item.
agree.surps.gpt2_a <- read.csv("../data/gpt2/items_agreement.gpt2.csv") %>%
  mutate(
    Type = "AGREE",
    pGram = recode(Condition, "UNAGREE" = "U", "AGREE" = "G"),
    item = Corresponding.NP.Z.item
  )

# Main file: Type comes from the first "_"-separated part of `condition`;
# the pGram part produced by separate() is then overwritten from `ambiguity`.
agree.surps.gpt2_m <- read.csv("../data/gpt2/items_main.gpt2.csv") %>%
  separate(condition, c("Type", "pGram"), sep = "_") %>%
  mutate(pGram = recode(ambiguity, "ambiguous" = "U", "unambiguous" = "G"))

# Stack the two tables, restricted to the columns they have in common.
com_cols <- intersect(colnames(agree.surps.gpt2_a), colnames(agree.surps.gpt2_m))
agree.surps.gpt2 <- rbind(
  agree.surps.gpt2_a[, com_cols],
  agree.surps.gpt2_m[, com_cols]
)

# Disabled sentinel handling, kept for reference:
#   agree.surps[agree.surps$surprisal == -1,]$surprisal <- NA  (recode NA surprisals as real NAs)

# Shift both models' word positions by one so that they are 1-indexed.
agree.surps.lstm$word_pos <- agree.surps.lstm$word_pos + 1
agree.surps.gpt2$word_pos <- agree.surps.gpt2$word_pos + 1

In [10]:

# Sanity check: list any (item, Type, pGram, word_pos) key that occurs more
# than once in the LSTM surprisal table, with one example sentence. An empty
# result means the key is unique.
agree.surps.lstm %>%
  group_by(item, Type, pGram, word_pos) %>%
  summarise(count = n(), sents = first(Sentence)) %>%
  filter(count > 1)

[1m[22m`summarise()` has grouped output by 'item', 'Type', 'pGram'. You can override using the `.groups` argument.


item,Type,pGram,word_pos,count,sents
<int>,<chr>,<chr>,<dbl>,<int>,<chr>


In [11]:
# Attach a frequency and both models' surprisals to every critical-item
# reading time. All merges are left joins (all.x = TRUE): SPR rows without a
# match are kept with NA in the joined columns.

# Frequencies are looked up by the comma-stripped word form.
agree.freqs <- merge(
  agree.spr, freqs,
  by.x = "word.clean", by.y = "word",
  all.x = TRUE
)

# LSTM surprisal, matched on item, condition and word position.
agree.surps <- merge(
  agree.freqs, agree.surps.lstm,
  by.x = c("item", "Type", "pGram", "WordPosition"),
  by.y = c("item", "Type", "pGram", "word_pos"),
  all.x = TRUE
)
agree.surps[["surprisal_lstm"]] <- agree.surps$sum_surprisal

# GPT-2 surprisal, matched on the same keys.
agree.surps <- merge(
  agree.surps, agree.surps.gpt2,
  by.x = c("item", "Type", "pGram", "WordPosition"),
  by.y = c("item", "Type", "pGram", "word_pos"),
  all.x = TRUE
)
agree.surps[["surprisal_gpt2"]] <- agree.surps$sumsurprisal

In [12]:
# Attach properties of the three preceding words to every critical-item row,
# then keep only the regions of interest (ROI 0, 1, 2).
#
# For every (item, participant) reading, each row receives the RT, word
# length, log frequency and both surprisals of the 1st, 2nd and 3rd preceding
# rows (suffixes _p1, _p2, _p3).
#
# Two safeguards compared with lagging the merged table as-is:
#  * rows are sorted by WordPosition inside each group before lagging, so the
#    lags do not depend on the row order left behind by merge();
#  * the data is ungrouped before `position` is created. subset() keeps the
#    grouping, so building the factor inside a grouped mutate() would compute
#    its levels per group, and the combined level order would then depend on
#    which ROI values the first groups happen to contain. Ungrouped,
#    as.factor() orders the levels by ROI value, which is what the later
#    contr.sum() contrasts and the "0"/"1"/"2" recodes rely on.
#
# NOTE(review): lag() looks at the preceding *row*. Rows with RT > 7000 were
# removed when the SPR data was loaded, so after such a gap "_p1" refers to
# the nearest remaining earlier word, not necessarily WordPosition - 1.
agree.with_lags <- agree.surps %>%
  group_by(item, participant) %>%
  arrange(WordPosition, .by_group = TRUE) %>%
  mutate(
    RT_p1 = lag(RT),
    RT_p2 = lag(RT_p1),
    RT_p3 = lag(RT_p2),
    length = nchar(EachWord),
    length_p1 = lag(length),
    length_p2 = lag(length_p1),
    length_p3 = lag(length_p2),
    logfreq = log(count),
    logfreq_p1 = lag(logfreq),
    logfreq_p2 = lag(logfreq_p1),
    logfreq_p3 = lag(logfreq_p2),
    surprisal_lstm_p1 = lag(surprisal_lstm),
    surprisal_lstm_p2 = lag(surprisal_lstm_p1),
    surprisal_lstm_p3 = lag(surprisal_lstm_p2),
    surprisal_gpt2_p1 = lag(surprisal_gpt2),
    surprisal_gpt2_p2 = lag(surprisal_gpt2_p1),
    surprisal_gpt2_p3 = lag(surprisal_gpt2_p2)
  ) %>%
  ungroup() %>%
  subset(ROI %in% c(0, 1, 2)) %>%
  mutate(position = droplevels(as.factor(ROI)))


In [13]:
# Code Properly
# Numeric codings of the design factors:
#   pGram.coded       U = 1,     G = 0
#   Type.coded        AGREE = 0, NPZ = 1
#   position.coded.1  ROI 0 = 0.5, ROI 1 = 0,   ROI 2 = -0.5
#   position.coded.2  ROI 0 = 0,   ROI 1 = 0.5, ROI 2 = -0.5
agree.with_lags$pGram.coded <- with(
  agree.with_lags,
  recode(pGram, "U" = 1, "G" = 0)
)
agree.with_lags$Type.coded <- with(
  agree.with_lags,
  recode(Type, "AGREE" = 0, "NPZ" = 1)
)
agree.with_lags$position.coded.1 <- with(
  agree.with_lags,
  recode(position, "0" = 0.5, "1" = 0, "2" = -0.5)
)
agree.with_lags$position.coded.2 <- with(
  agree.with_lags,
  recode(position, "0" = 0, "1" = 0.5, "2" = -0.5)
)

In [14]:
# Drop critical-item rows with missing predictors. A row is kept only if the
# current word and each of the 3 preceding words have both a surprisal (from
# the respective model) and a log frequency, i.e. every predictor of the
# filler models is available for predict().
#
# The GPT-2 filter checks logfreq and logfreq_p1 as well, so that it applies
# the same criterion as the LSTM filter and as both filler filters; without
# those two checks, rows lacking a predictor of the GPT-2 model could be kept.
agree.drop.lstm <- subset(agree.with_lags, !is.na(surprisal_lstm) & !is.na(surprisal_lstm_p1) &
                                        !is.na(surprisal_lstm_p2) & !is.na(surprisal_lstm_p3) &
                                        !is.na(logfreq) & !is.na(logfreq_p1) &
                                        !is.na(logfreq_p2) & !is.na(logfreq_p3))

agree.drop.gpt2 <- subset(agree.with_lags, !is.na(surprisal_gpt2) & !is.na(surprisal_gpt2_p1) &
                                        !is.na(surprisal_gpt2_p2) & !is.na(surprisal_gpt2_p3) &
                                        !is.na(logfreq) & !is.na(logfreq_p1) &
                                        !is.na(logfreq_p2) & !is.na(logfreq_p3))


# print number of remaining rows
print(nrow(agree.with_lags))
print(nrow(agree.drop.lstm))
print(nrow(agree.drop.gpt2))

[1] 92544
[1] 81163
[1] 81163


In [15]:
# Word forms for which no frequency count was found.
unique(agree.surps$word.clean[is.na(agree.surps$count)])

# Missing-data breakdown for ROI 0 of the AGREE items: total rows, then the
# number of rows lacking each LSTM surprisal and each log-frequency predictor.
agree.roi0 <- subset(agree.with_lags, ROI == 0 & Type == "AGREE")
nrow(agree.roi0)
sum(is.na(agree.roi0$surprisal_lstm))
sum(is.na(agree.roi0$surprisal_lstm_p1))
sum(is.na(agree.roi0$surprisal_lstm_p2))
sum(is.na(agree.roi0$surprisal_lstm_p3))

sum(is.na(agree.roi0$logfreq))
sum(is.na(agree.roi0$logfreq_p1))
sum(is.na(agree.roi0$logfreq_p2))
sum(is.na(agree.roi0$logfreq_p3))

# Rows of that cell that survive the LSTM completeness filter.
nrow(subset(agree.drop.lstm, ROI == 0 & Type == "AGREE"))

In [16]:


# For each model: give `position` halved sum contrasts, add the filler
# model's predicted RT for every critical row (participants/items unseen in
# the filler fit are allowed), and save the resulting table.

# LSTM
contrasts(agree.drop.lstm$position) <- contr.sum(3) / 2
agree.drop.lstm$predicted <- predict(
  models.filler.lstm,
  newdata = agree.drop.lstm,
  allow.new.levels = TRUE
)
saveRDS(agree.drop.lstm, file = "agreement_data.lstm.rds")

# GPT-2
contrasts(agree.drop.gpt2$position) <- contr.sum(3) / 2
agree.drop.gpt2$predicted <- predict(
  models.filler.gpt2,
  newdata = agree.drop.gpt2,
  allow.new.levels = TRUE
)
saveRDS(agree.drop.gpt2, file = "agreement_data.gpt2.rds")


In [17]:
# Expose Type as a factor column named `condition`, then print per-column
# summaries of the LSTM data set.
agree.drop.lstm[["condition"]] <- as.factor(agree.drop.lstm[["Type"]])

summary(agree.drop.lstm)

      item          Type       pGram      WordPosition   word.clean       
 Min.   : 1.00   AGREE:39823   G:40513   Min.   :7.00   Length:81163      
 1st Qu.: 7.00   NPZ  :41340   U:40650   1st Qu.:8.00   Class :character  
 Median :12.00                           Median :8.00   Mode  :character  
 Mean   :12.49                           Mean   :8.14                     
 3rd Qu.:19.00                           3rd Qu.:9.00                     
 Max.   :24.00                           Max.   :9.00                     
                                                                          
      Time           participant          EachWord           EventTime        
 Min.   :1.627e+09   Length:81163       Length:81163       Min.   :1.630e+12  
 1st Qu.:1.635e+09   Class :character   Class :character   1st Qu.:1.635e+12  
 Median :1.636e+09   Mode  :character   Mode  :character   Median :1.639e+12  
 Mean   :1.636e+09                                         Mean   :1.637e+12  
 3rd 