In [1]:
library(ggplot2)
library(plyr)
library(dplyr)
library(tidyr)
library(lme4)
library(lmerTest)
library(stringr)


Attaching package: 'dplyr'


The following objects are masked from 'package:plyr':

    arrange, count, desc, failwith, id, mutate, rename, summarise,
    summarize


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union


Loading required package: Matrix


Attaching package: 'Matrix'


The following objects are masked from 'package:tidyr':

    expand, pack, unpack



Attaching package: 'lmerTest'


The following object is masked from 'package:lme4':

    lmer


The following object is masked from 'package:stats':

    step




In [2]:
# Load in SPR and surprisal data for a subset

spr <- read.csv("./Fillers.csv")
spr$Sentence <- str_replace_all(spr$Sentence, "%2C", ",")
spr <- spr %>% filter(RT<=7000) %>% rename(participant = MD5)

surps_lstm <- read.csv("../data/gulordava/items_filler.lstm.csv")
surps_gpt2 <- read.csv("../data/gpt2/items_filler.gpt2.csv")
#surps[surps$mean_surprisal == -1,]$mean_surprisal <- NA # 
#surps[surps$sum_surprisal == -1,]$sum_surprisal <- NA # recode NA surprisals as real NAs
surps_lstm$word_pos = surps_lstm$word_pos + 1# adjust to 1-indexing
surps_gpt2$word_pos = surps_gpt2$word_pos + 1# adjust to 1-indexing

# Load in frequencies from the Gulordava Wikipedia corpus
freqs <- read.csv("./freqs.csv")


In [3]:
# merge the two dfs such that we have the relevant surprisal and frequency with each rt

spr$word <- tolower(spr$EachWord)
filler.freqs <- merge(x=spr, y=freqs, by.x="word", by.y="word", all.x=TRUE)

filler.surps <- merge(x=filler.freqs, y=surps_lstm, 
                      by.x=c("item", "WordPosition"), by.y=c("item.here", "word_pos"), all.x=TRUE)
filler.surps$surprisal_lstm = filler.surps$sum_surprisal # change to avg if that's more appropriate

filler.surps <- merge(x=filler.surps, y=surps_gpt2, 
                      by.x=c("item", "WordPosition"), by.y=c("item.here", "word_pos"), all.x=TRUE)
filler.surps$surprisal_gpt2 = filler.surps$sumsurprisal # change to avg if that's more appropriate


In [4]:
# Store properties of past words in each row (going back 3 words)
filler.with_lags <- filler.surps %>% group_by_at(vars(item, participant)) %>%
                    mutate(RT_p1 = lag(RT), 
                           RT_p2 = lag(RT_p1), 
                           RT_p3 = lag(RT_p2),
                           length = nchar(EachWord),
                           length_p1 = lag(length), 
                           length_p2 = lag(length_p1),
                           length_p3 = lag(length_p2),
                           logfreq = log(count),
                           logfreq_p1 = lag(logfreq), 
                           logfreq_p2 = lag(logfreq_p1),
                           logfreq_p3 = lag(logfreq_p2),
                           surprisal_lstm_p1 = lag(surprisal_lstm),
                           surprisal_lstm_p2 = lag(surprisal_lstm_p1),
                           surprisal_lstm_p3 = lag(surprisal_lstm_p2),
                           surprisal_gpt2_p1 = lag(surprisal_gpt2),
                           surprisal_gpt2_p2 = lag(surprisal_gpt2_p1),
                           surprisal_gpt2_p3 = lag(surprisal_gpt2_p2)
                  )

In [7]:
# drop rows with missing data (surprisals for past 3 words and freqs for past 3 words)

filler.drop.lstm <- subset(filler.with_lags, !is.na(surprisal_lstm) & !is.na(surprisal_lstm_p1) & 
                                        !is.na(surprisal_lstm_p2) & !is.na(surprisal_lstm_p3) &
                                        !is.na(logfreq) & !is.na(logfreq_p1) &
                                        !is.na(logfreq_p2) & !is.na(logfreq_p3))

filler.drop.gpt2 <- subset(filler.with_lags, !is.na(surprisal_gpt2) & !is.na(surprisal_gpt2_p1) & 
                                        !is.na(surprisal_gpt2_p2) & !is.na(surprisal_gpt2_p3) &
                                        !is.na(logfreq) & !is.na(logfreq_p1) &
                                        !is.na(logfreq_p2) & !is.na(logfreq_p3))

# print number of remaining rows
print(nrow(filler.with_lags))
print(nrow(filler.drop.lstm))
print(nrow(filler.drop.gpt2))


all_fillers = levels(as.factor(filler.with_lags$item))
print(length(all_fillers))
lstm_fillers = levels(as.factor(filler.drop.lstm$item))
print(length(lstm_fillers))
gpt2_fillers = levels(as.factor(filler.drop.gpt2$item))
print(length(gpt2_fillers))

#items that have been dropped
diff = setdiff(all_fillers, lstm_fillers)
print(diff)

filler.dropped <- subset(filler.with_lags, (is.na(surprisal_lstm) | is.na(surprisal_lstm_p1) | 
                                        is.na(surprisal_lstm_p2) | is.na(surprisal_lstm_p3) |
                                        is.na(logfreq) | is.na(logfreq_p1) |
                                        is.na(logfreq_p2) | is.na(logfreq_p3)) & (item %in% diff)) %>%
                  group_by(item, WordPosition) %>%
                  summarize(word = first(word),
                            sent = first(Sentence.x),
                           logfreq = first(logfreq),
                           logfreq_p1 = first(logfreq_p1), 
                           logfreq_p2 = first(logfreq_p2),
                           logfreq_p3 = first(logfreq_p3),
                           surprisal_lstm = first(surprisal_lstm),
                           surprisal_lstm_p1 = first(surprisal_lstm_p1),
                           surprisal_lstm_p2 = first(surprisal_lstm_p2),
                           surprisal_lstm_p3 = first(surprisal_lstm_p3))

filler.dropped


[1] 1403522
[1] 573776
[1] 573776
[1] 40
[1] 30
[1] 30
 [1] "103" "104" "105" "106" "107" "108" "109" "110" "111" "112"


[1m[22m`summarise()` has grouped output by 'item'. You can override using the `.groups` argument.


item,WordPosition,word,sent,logfreq,logfreq_p1,logfreq_p2,logfreq_p3,surprisal_lstm,surprisal_lstm_p1,surprisal_lstm_p2,surprisal_lstm_p3
<int>,<int>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
103,1,,"For centuries, time was measured by the position of the sun with the use of sundials.",13.287778,,,,,,,
103,2,,"For centuries, time was measured by the position of the sun with the use of sundials.",,13.287778,,,,,,
103,3,,"For centuries, time was measured by the position of the sun with the use of sundials.",11.520854,,13.287778,,,,,
103,4,,"For centuries, time was measured by the position of the sun with the use of sundials.",13.626671,11.520854,,13.287778,,,,
103,5,,"For centuries, time was measured by the position of the sun with the use of sundials.",7.762596,13.626671,11.520854,,,,,
103,6,,"For centuries, time was measured by the position of the sun with the use of sundials.",13.107089,7.762596,13.626671,11.520854,,,,
103,7,,"For centuries, time was measured by the position of the sun with the use of sundials.",15.469250,13.107089,7.762596,13.626671,,,,
103,8,,"For centuries, time was measured by the position of the sun with the use of sundials.",9.797571,15.469250,13.107089,7.762596,,,,
103,9,,"For centuries, time was measured by the position of the sun with the use of sundials.",14.744420,9.797571,15.469250,13.107089,,,,
103,10,,"For centuries, time was measured by the position of the sun with the use of sundials.",15.469250,14.744420,9.797571,15.469250,,,,


In [6]:
models.filler.lstm <- lmer(data=filler.drop.lstm,
                      RT ~ surprisal_lstm + surprisal_lstm_p1 + surprisal_lstm_p2 + surprisal_lstm_p3 +
                           WordPosition + logfreq*length + logfreq_p1*length_p1 + 
                           logfreq_p2*length_p2 + logfreq_p3*length_p3 + (1 | participant) + (1 | item))
summary(models.filler.lstm) 

saveRDS(models.filler.lstm, "filler_lstm_sum.rds")

In [None]:
models.filler.gpt2 <- lmer(data=filler.drop.gpt2,
                      RT ~ surprisal_gpt2 + surprisal_gpt2_p1 + surprisal_gpt2_p2 + surprisal_gpt2_p3 +
                           WordPosition + logfreq*length + logfreq_p1*length_p1 + 
                           logfreq_p2*length_p2 + logfreq_p3*length_p3 + (1 | participant) + (1 | item))
summary(models.filler.gpt2) 

saveRDS(models.filler.gpt2, "filler_gpt2_sum.rds")

In [None]:
# Now that we've fit a model on the fillers, load in and predict on criticals

agree.spr <- read.csv("./AgreementSet.csv")
agree.spr$Sentence <- str_replace_all(agree.spr$Sentence, "%2C", ",")
agree.spr <- agree.spr %>% filter(RT<=7000) %>% rename(participant=MD5)


# Since we're analyzing this one, preprocess a bit
agree.spr$Type[agree.spr$Type == "AGREE"] <- "AGREE_G"
agree.spr <- agree.spr %>% separate(Type, c("Type", "pGram"), sep="_")
agree.spr$pGram[agree.spr$pGram == "UAMB"] <- "G"
agree.spr$pGram[agree.spr$pGram == "AMB"] <- "U"
agree.spr$pGram[agree.spr$pGram == "UNG"] <- "U"

agree.spr$pGram <- as.factor(agree.spr$pGram)
agree.spr$Type <- as.factor(agree.spr$Type)

agree.surps.lstm_a <- read.csv("../data/gulordava/items_agreement.lstm.csv")
agree.surps.lstm_m <- read.csv("../data/gulordava/items_main.lstm.csv")
com_cols <- intersect(colnames(agree.surps.lstm_a), colnames(agree.surps.lstm_m))
agree.surps.lstm <- rbind(agree.surps.lstm_a[,com_cols], agree.surps.lstm_m[,com_cols])


agree.surps.gpt2_a <- read.csv("../data/gpt2/items_agreement.gpt2.csv")
agree.surps.gpt2_m <- read.csv("../data/gpt2/items_main.gpt2.csv")
com_cols <- intersect(colnames(agree.surps.gpt2_a), colnames(agree.surps.gpt2_m))
agree.surps.gpt2 <- rbind(agree.surps.gpt2_a[,com_cols], agree.surps.gpt2_m[,com_cols])

#agree.surps[agree.surps$surprisal == -1,]$surprisal <- NA # recode NA surprisals as real NAs
agree.surps.lstm$word_pos = agree.surps.lstm$word_pos + 1# adjust to 1-indexing
agree.surps.gpt2$word_pos = agree.surps.gpt2$word_pos + 1# adjust to 1-indexing

In [None]:
agree.freqs <- merge(x=agree.spr, y=freqs, by.x="EachWord", by.y="word", all.x=TRUE)
agree.surps <- merge(x=agree.freqs, y=agree.surps.lstm, 
                      by.x=c("item", "WordPosition"), by.y=c("item.here", "word_pos"), all.x=TRUE)
agree.surps$surprisal_lstm <- agree.surps$sum_surprisal

agree.surps <- merge(x=agree.surps, y=agree.surps.gpt2, 
                      by.x=c("item", "WordPosition"), by.y=c("item.here", "word_pos"), all.x=TRUE)
agree.surps$surprisal_gpt2 <- agree.surps$sumsurprisal

In [None]:
agree.with_lags <-  agree.surps %>% group_by_at(vars(item, participant)) %>%
                    mutate(RT_p1 = lag(RT), 
                           RT_p2 = lag(RT_p1), 
                           RT_p3 = lag(RT_p2),
                           length = nchar(EachWord),
                           length_p1 = lag(length), 
                           length_p2 = lag(length_p1),
                           length_p3 = lag(length_p2),
                           logfreq = log(count),
                           logfreq_p1 = lag(logfreq), 
                           logfreq_p2 = lag(logfreq_p1),
                           logfreq_p3 = lag(logfreq_p2),
                           surprisal_lstm_p1 = lag(surprisal_lstm),
                           surprisal_lstm_p2 = lag(surprisal_lstm_p1),
                           surprisal_lstm_p3 = lag(surprisal_lstm_p2),
                           surprisal_gpt2_p1 = lag(surprisal_gpt2),
                           surprisal_gpt2_p2 = lag(surprisal_gpt2_p1),
                           surprisal_gpt2_p3 = lag(surprisal_gpt2_p2)
                  ) %>% subset(ROI %in% c(0, 1, 2)) %>%
                    mutate(position=droplevels(as.factor(ROI)))


In [None]:
agree.drop.lstm <- subset(agree.with_lags, !is.na(surprisal_lstm) & !is.na(surprisal_lstm_p1) & 
                                        !is.na(surprisal_lstm_p2) & !is.na(surprisal_lstm_p3) &
                                        !is.na(logfreq) & !is.na(logfreq_p1) &
                                        !is.na(logfreq_p2) & !is.na(logfreq_p3))

agree.drop.gpt2 <- subset(agree.with_lags, !is.na(surprisal_gpt2) & !is.na(surprisal_gpt2_p1) & 
                                        !is.na(surprisal_gpt2_p2) & !is.na(surprisal_gpt2_p3) &
                                        !is.na(logfreq_p2) & !is.na(logfreq_p3))


# print number of remaining rows
print(nrow(agree.with_lags))
print(nrow(agree.drop.lstm))
print(nrow(agree.drop.gpt2))

In [None]:

agree.drop.lstm$predicted <- predict(models.filler.lstm, newdata=agree.drop.lstm, allow.new.levels = TRUE)
contrasts(agree.drop.lstm$position) <- contr.sum(3)/2
saveRDS(agree.drop.lstm, "agreement_data.lstm.rds")

contrasts(agree.drop.gpt2$position) <- contr.sum(3)/2
agree.drop.gpt2$predicted <- predict(models.filler.gpt2, newdata=agree.drop.gpt2, allow.new.levels = TRUE)
saveRDS(agree.drop.gpt2, "agreement_data.gpt2.rds")


In [None]:
agree.drop.lstm$condition <- as.factor(agree.drop.lstm$Type)

summary(agree.drop.lstm)