In [1]:
# Install the packages this notebook relies on.
# glue and MLmetrics are attached with library() in the next cell, so they are
# listed explicitly here; without MLmetrics that library() call fails on a
# fresh installation.
# NOTE(review): keras is kept from the original list; it is not attached in the
# cells visible here -- presumably it is used later in the notebook.
install.packages(c("tidyverse", "glue", "readtext", 
    "quanteda", "quanteda.textmodels", "MLmetrics",
    "topicmodels", "keras", "topicdoc"))

In [8]:
# General packages and dictionary analysis
library(glue)
library(tidyverse)
library(readtext)
library(quanteda)

# Supervised text classification
library(quanteda.textmodels)
library(MLmetrics)

# Topic Modeling
library(topicmodels)
library(topicdoc)

In [3]:
# Load the review corpus into `reviewdata`, using a local RDS cache if present.
# On a cache miss: download and unpack the archive, read every .txt file three
# directory levels below "aclImdb", keep only the reviews labelled pos/neg,
# convert the result to a quanteda corpus and store it in the cache file.
filename = "reviewdata.rds"
if (file.exists(filename)) {
    print("Using cached data")
    reviewdata = readRDS(filename)
} else {
    print("Downloading data")
    fn = "aclImdb_v1.tar.gz"
    url = glue("https://cssbook.net/d/{fn}")
    # download.file() obeys the `timeout` option (60 seconds by default), which
    # can be too short for an archive download on a slow connection. Raise it
    # for this call only and restore the previous value afterwards.
    old_opts = options(timeout = max(600, getOption("timeout")))
    status = tryCatch(
      download.file(url, fn),
      finally = options(old_opts)
    )
    # Some download methods signal failure through a non-zero return status
    # instead of an error; do not continue with a missing or partial archive.
    if (status != 0) {
      unlink(fn)
      stop("Download of ", url, " failed with status ", status, call. = FALSE)
    }
    untar(fn)
    # Document variables are taken from the path components:
    # i (top-level directory), dataset, label, and fn (file name).
    reviewdata = readtext(
      file.path("aclImdb", "*", "*", "*.txt"), 
      docvarsfrom = "filepaths", dvsep="[/\\]",
      docvarnames=c("i","dataset","label","fn"))
    # The texts are in memory now, so the unpacked tree and archive can go.
    unlink(c("aclImdb", fn), recursive=TRUE)
    # Keep only pos/neg labelled documents and drop the constant `i` column.
    reviewdata = reviewdata %>% 
      filter(label %in% c("pos", "neg")) %>% 
      select(-i) %>% 
      corpus()
    saveRDS(reviewdata, filename)
}
head(docvars(reviewdata))

[1] "Using cached data"


Unnamed: 0_level_0,dataset,label,fn
Unnamed: 0_level_1,<chr>,<chr>,<chr>
1,test,neg,0_2.txt
2,test,neg,1_3.txt
3,test,neg,10_3.txt
4,test,neg,100_4.txt
5,test,neg,1000_3.txt
6,test,neg,10000_4.txt


In [6]:
# Build a two-category sentiment dictionary (pos / neg) from two word lists
# that are read from the URLs below.
poswords = "https://cssbook.net/d/positive.txt"
negwords = "https://cssbook.net/d/negative.txt"
# what = character() makes scan() return a character vector of the
# whitespace-separated entries in each file.
pos = scan(poswords, what = character())
neg = scan(negwords, what = character())
sentimentdict = dictionary(list(pos=pos, neg=neg))

# For speed, we only score a random sample of 100 reviews. corpus_sample()
# draws at random, so the selected documents differ between runs unless a
# seed is set beforehand.
# Pipeline: tokenize -> document-feature matrix -> count dictionary matches
# per category -> data frame -> sentiment = positive minus negative count.
scores = corpus_sample(reviewdata, 100)  %>% 
  tokens() %>%
  dfm() %>% 
  dfm_lookup(sentimentdict) %>% 
  convert(to="data.frame")  %>% 
  mutate(sent = pos - neg)
head(scores)

Unnamed: 0_level_0,doc_id,pos,neg,sent
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>
1,test/neg/11772_2.txt/11772_2.txt,12,4,8
2,test/neg/4821_2.txt/4821_2.txt,7,5,2
3,test/pos/8473_9.txt/8473_9.txt,14,9,5
4,test/pos/159_10.txt/159_10.txt,20,12,8
5,test/neg/125_3.txt/125_3.txt,3,7,-4
6,test/neg/8146_4.txt/8146_4.txt,8,5,3
