-
Notifications
You must be signed in to change notification settings - Fork 0
/
05-get-corpus.r
112 lines (70 loc) · 2.37 KB
/
05-get-corpus.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#
# 5. get a structured corpus for the network
#
library(dplyr)
library(readr)
library(rvest)
library(ggplot2)
library(network)
library(stringr)
library(SnowballC) # stemming
library(tm) # cleaning
# Map an article URL on *.hypotheses.org to its local cache path under html/.
# Strings that do not match the pattern come back unchanged (plain gsub()
# behaviour), so such URLs never resolve to a cached file.
url_to_path = function(url) {
  gsub("http://(.*)\\.hypotheses\\.org/(.*)", "html/\\1.\\2.html", url)
}

# every URL that appears at either end of an edge
e = read_csv("data/edges.csv")
e = unique(c(e$i, e$j))
# keep each string from its last occurrence of "http" onwards
e = gsub("(.*)http(.*)", "http\\2", e)

# local cache path of every URL (parallel to `e`)
f = url_to_path(e)

# download.file() does not create the target directory; without it every
# download would fail silently inside try() below
dir.create("html", showWarnings = FALSE)

# URLs that are not cached yet
k = e[ !file.exists(f) ]

# allow for a few permanent errors
while (length(k) > 100) {

  cat("Downloading", sprintf("%4.0f", length(k)), "articles\n")

  # number of sampled articles that ended up with a non-empty file
  fetched = 0

  # at most 500 randomly selected articles per pass
  for (j in sample(k, min(length(k), 500))) {

    fn = url_to_path(j)

    if (!file.exists(fn)) {
      try(download.file(j, fn, quiet = TRUE), silent = TRUE)
      # be nice with Hypothèses: pause after each request actually sent
      Sys.sleep(1)
    }

    if (!file.exists(fn) || file.info(fn)$size == 0) {
      cat("failed to download", j, "\n")
    } else {
      fetched = fetched + 1
    }

  }

  # remove empty files, so that they count as missing on the next pass
  html_files = list.files("html", full.names = TRUE)
  empty = which(file.info(html_files)$size == 0)
  removed = file.remove(html_files[ empty ])
  if (sum(removed) > 0) {
    cat("Removed", sum(removed), "empty files\n")
  }

  k = e[ !file.exists(f) ]

  # a pass without a single success means that the remaining failures are
  # permanent (or that the network is down): stop instead of looping forever
  if (fetched == 0) {
    warning("No article could be downloaded in the last pass; giving up with ",
            length(k), " articles still missing", call. = FALSE)
    break
  }

}
# Build the corpus: one document per network and per community ("oc" vertex
# attribute), made of the dc:subject keywords found in the cached HTML pages
# of the vertices that belong to that community.

# presumably provides `l`, the list of networks iterated below -- TODO confirm
load("model/networks.rda")

# one single-row data frame per document; bound together after the loops
# rather than growing a data frame with rbind() at every iteration
docs = list()

for (n in l) {

  yr = n %n% "year"
  cat("Building corpus for", yr)

  # files for all nodes (both senders and receivers)
  all_files = n %n% "files"

  for (j in unique(n %v% "oc")) {

    # subset to the optimal community under scrutiny
    oc = network.vertex.names(n)[ n %v% "oc" == j ]
    oc = paste0(oc, collapse = "|")
    # NOTE(review): vertex names are used as an unescaped, prefix-only regex;
    # a name that is a prefix of another one (or that contains regex
    # metacharacters) would also match the other's files -- confirm intended
    ff = all_files[ grepl(paste0("^html/(", oc, ")"), all_files) ]

    # lower-cased dc:subject keywords of every cached page in the community
    tags = unlist(lapply(ff[ file.exists(ff) ], function(i) {
      # read_html() replaces html(), which was removed from rvest 1.0.0;
      # a page that cannot be parsed is reported and skipped
      h = tryCatch(read_html(i), error = function(err) {
        warning("Could not parse ", i, ": ", conditionMessage(err),
                call. = FALSE)
        NULL
      })
      if (is.null(h)) {
        return(character(0))
      }
      html_nodes(h, xpath = "//meta[@property = 'dc:subject']") %>%
        html_attr("content") %>%
        tolower()
    }), use.names = FALSE)

    # remove prefix numbers
    tags = gsub("^\\d+(-|\\.)? ", "", tags)
    # remove punctuation
    tags = gsub("[[:punct:]]", " ", tags)
    # trim whitespace
    tags = str_trim(gsub("\\s+", " ", tags))
    # keyword exceptions (and <meta> elements without a content attribute,
    # which would otherwise end up as the literal keyword "NA")
    tags = tags[ !is.na(tags) & !tags %in% c("", "non classé") ]

    if (length(tags) > 0) {
      docs[[ length(docs) + 1 ]] = data.frame(
        year = yr,
        oc = j,
        text = paste0(tags, collapse = " . "),
        stringsAsFactors = FALSE
      )
    }

  }

  # number of documents collected so far for that year
  n_docs = sum(vapply(docs, function(d) d$year == yr, logical(1)))
  cat(":", n_docs, "documents\n")

}

if (length(docs) > 0) {
  text = do.call(rbind, docs)
} else {
  warning("No document could be built; writing an empty corpus",
          call. = FALSE)
  text = data.frame(year = integer(0), oc = character(0),
                    text = character(0), stringsAsFactors = FALSE)
}

# replace the community id by a sequential document id (i001, i002, ...);
# sprintf() on seq_len() also yields nothing for an empty corpus, where
# 1:nrow(text) would have produced the bogus ids 1 and 0
text$oc = sprintf("i%03.0f", seq_len(nrow(text)))

write_csv(text, "data/corpus.csv")