/
Sakefile
119 lines (116 loc) · 4.39 KB
/
Sakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
---
# Meta-target: groups every step that turns the corpus databases into
# intermediate data files under Data/ and Output/.
generate data:
    help: load and process all data from corpora

    # Runs Python/select_articles.py. The script itself is listed as a
    # dependency so that editing it marks this target as out of date.
    select articles:
        help: find all NGO mentions in the corpora
        dependencies:
            - ./Python/select_articles.py
            - ./Data/ngos.csv
            - ./Corpora/egypt_independent.db
            - ./Corpora/ahram.db
            - ./Corpora/dne.db
        formula: >
            python3 Python/select_articles.py
        output:
            - ./Data/ngo_mentions.obj
            - ./Data/ngo_mention_ids.csv
            - ./Data/publication_counts.csv
            - ./Data/month_year_counts.csv

    # Runs Python/extract_pos.py; consumes ngo_mentions.obj produced by
    # "select articles", so it is ordered after that target.
    extract pos:
        help: extract verbs from sentences that mention NGOs
        dependencies:
            - ./Python/extract_pos.py
            - ./Data/ngos.csv
            - ./Data/ngo_mentions.obj
            - ./Corpora/egypt_independent.db
            - ./Corpora/ahram.db
            - ./Corpora/dne.db
        formula: >
            python3 Python/extract_pos.py
        output:
            - ./Output/source_list_WILL_BE_OVERWRITTEN.xlsx
            - ./Output/verb_list.csv

    # Calls Python/export_to_mallet.py once per corpus database, writing
    # into the shared Data/articles directory.
    export articles:
        help: export articles from SQLite databases and stem and n-gram them
        dependencies:
            - ./Python/export_to_mallet.py
            - ./Corpora/egypt_independent.db
            - ./Corpora/ahram.db
            - ./Corpora/dne.db
        # `mkdir -p` succeeds when the directory already exists but still
        # reports genuine failures instead of discarding them.
        formula: >
            mkdir -p Data/articles;
            python3 Python/export_to_mallet.py Corpora/egypt_independent.db egypt_independent Data/articles;
            python3 Python/export_to_mallet.py Corpora/ahram.db ahram Data/articles;
            python3 Python/export_to_mallet.py Corpora/dne.db dne Data/articles;
        output:
            - ./Data/articles/*.txt

    # Runs Python/process_natural_language.py under python2 (the other
    # Python steps in this file use python3). The stopword list is passed on
    # the command line, so it is declared as a dependency as well.
    clean and process articles:
        help: stem and clean up exported articles
        dependencies:
            - ./Python/process_natural_language.py
            - ./Corpora/stopwords.txt
            - ./Data/articles/*.txt
        formula: >
            mkdir -p Data/articles_stemmed;
            python2 Python/process_natural_language.py Data/articles/ Data/articles_stemmed Corpora/stopwords.txt Output/bigrams.csv
        output:
            - ./Data/articles_stemmed/*.txt
            - ./Output/bigrams.csv
# Meta-target: the R steps that turn the generated data files into
# .RData objects and topic-model artifacts under Data/.
analyze data:
    help: load data and build all models

    # Executes R/load_data.R from inside the R/ directory.
    load data into R:
        help: load NGO articles into R
        dependencies: [./Data/ngos.csv, ./Data/ngo_mention_ids.csv, ./R/load_data.R]
        formula: |
            cd R; Rscript load_data.R
        output: [./Data/mentions_data.RData, ./Data/full_text.RData]

    # Executes R/create_topic_model.R from inside the R/ directory, using the
    # stemmed article files as input.
    build topic model:
        help: create a topic model using the stemmed articles
        dependencies: [./R/create_topic_model.R, ./Data/articles_stemmed/*.txt]
        formula: |
            cd R; Rscript create_topic_model.R
        output: [
            ./Data/topic_model.RData,
            ./Data/topics.mallet,
            ./Data/topic-state.gz,
            ./Data/topic-keys.txt,
            ./Data/topic-doctopics.txt,
            ./Data/topic-docs.csv
        ]
# Meta-target: produces the tables and plots written to Output/.
generate output:
    help: generate all output for the paper and presentation

    # Executes R/analyze_sources_orgs_topics.R from inside the R/ directory.
    # The script is listed as a dependency so that editing it marks this
    # target as out of date, as the other R targets in this file do.
    output analysis:
        help: "summary tables and plots of sources, organizations, and topics"
        dependencies:
            - ./R/analyze_sources_orgs_topics.R
            - ./Data/topic_model.RData
            - ./Data/mentions_data.RData
            - ./Data/full_text.RData
            # NOTE(review): no target in this file produces this workbook;
            # "extract pos" writes Output/source_list_WILL_BE_OVERWRITTEN.xlsx
            # instead. Presumably this is a hand-maintained copy - confirm.
            - ./Data/source_list.xlsx
            - ./Data/publication_counts.csv
            - ./Data/month_year_counts.csv
        formula: >
            cd R; Rscript analyze_sources_orgs_topics.R
        output:
            # Markdown tables
            - ./Output/table_topic_model.md
            - ./Output/table_article_validation.md
            - ./Output/table_source_mention_ratio.md
            - ./Output/table_source_organization.md
            - ./Output/table_source_publication.md
            # Plots, each emitted as both PDF and PNG
            - ./Output/plot_article_validation.pdf
            - ./Output/plot_article_validation.png
            - ./Output/plot_org_topic_validation.pdf
            - ./Output/plot_org_topic_validation.png
            - ./Output/plot_sources_time.pdf
            - ./Output/plot_sources_time.png
            - ./Output/plot_mosaic.pdf
            - ./Output/plot_mosaic.png
            - ./Output/plot_source_pub.pdf
            - ./Output/plot_source_pub.png
            - ./Output/plot_topic_pub.pdf
            - ./Output/plot_topic_pub.png
            - ./Output/plot_topic_pub_org.pdf
            - ./Output/plot_topic_pub_org.png
...