# Permalink
# Fetching contributors…
# Cannot retrieve contributors at this time
# 40 lines (32 sloc) 851 Bytes
library(dplyr)
library(purrr)
library(rvest)
library(stringr)
#' Scrape the transcript of one This American Life episode.
#'
#' Downloads the transcript page for episode `i` and returns one row per
#' transcript paragraph (the `.act p` nodes), with the episode-level
#' metadata repeated on every row.
#'
#' @param i Episode number; interpolated into the transcript URL via `%s`.
#' @return A tibble with columns `episode`, `title`, `date` (a `Date`),
#'   `time` (each paragraph's `begin` attribute) and `line` (the paragraph
#'   text). When the page has no `.act p` nodes the result is a single row
#'   whose `time` and `line` are `NA`, so the episode metadata is still kept.
#'   Errors raised by `read_html()` (e.g. HTTP failures) are not caught here.
scrape_transcript <- function(i) {
  url <- sprintf(
    "http://www.thisamericanlife.org/radio-archives/episode/%s/transcript",
    i
  )
  page <- read_html(url)
  transcript <- html_nodes(page, ".act p")

  # The date is matched only at the very end of the `.radio-date` text, in
  # MM.DD.YYYY form; anything else yields NA.
  air_date <- page %>%
    html_node(".radio-date") %>%
    html_text() %>%
    str_extract("\\d{2}\\.\\d{2}\\.\\d{4}$") %>%
    as.Date("%m.%d.%Y")

  # Episode-level metadata: always exactly one row.
  # `tibble()` replaces the deprecated `data_frame()`.
  meta <- tibble(
    episode = i,
    title = page %>%
      html_node("#content h2 a") %>%
      html_text(),
    date = air_date,
    k = 1
  )

  # One row per transcript paragraph.
  lines <- tibble(
    time = html_attr(transcript, "begin"),
    line = html_text(transcript),
    k = 1
  )

  # `k` is a constant dummy key: joining on it attaches the single metadata
  # row to every transcript line. A left join (rather than a cross join)
  # keeps the metadata row even when `lines` is empty.
  left_join(meta, lines, by = "k") %>%
    select(-k)
}
# The archive had 591 episodes when this script was written.
# Some transcript pages cannot be fetched (episode 375 returns an HTTP
# "forbidden" response). A failed download is therefore reported as a
# warning and skipped, instead of aborting the whole run before anything
# is saved. `map_df()` drops the NULL returned for skipped episodes.
tal <- map_df(1:591, function(i) {
  tryCatch(
    scrape_transcript(i),
    error = function(e) {
      warning(
        "Skipping episode ", i, ": ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
})
saveRDS(tal, "tal-transcripts.rds")