/
settings.py
91 lines (69 loc) · 3.1 KB
/
settings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
##settings file for EcologIE PMCminer multi-species interactions extractor
from nltk.data import load as NLTKload
# --- Filesystem layout: adjust these paths for your own system ---

# Your archive of PMC .xml files.
ARCHIVE_DIR = "/home/david/Avidastuff/mining/pubmedoa/allPLos_One/"

# NOTE(review): purpose not documented here; presumably the name of the
# program / bin directory -- confirm against the code that reads it.
BIN = "pmcminer"

# Where you want to put your data.
DATA_DIR = "/home/david/textmining/PMCdata"

# The following directory names are located below DATA_DIR.
RAW_ARTICLES_DIR = "raw_articles"
PURE_ARTICLES_DIR = "pure"
ANALYSIS_DIR = "analysis_fc"

# Locations of the databases, in .tsv format.
SPECIESDB = (
    "/home/david/Avidastuff/mining/linnaeus/species/full_col_final.tsv"
)
INTERACTIONSDB = (
    "/home/david/Avidastuff/mining/linnaeus/interactions/dict-community-ecology-interactions.tsv"
)
# DOI lists to be used for the analysis, keyed by subject name.
#
# * The list files must be kept in DATA_DIR (values are relative paths).
# * The names must be correct even if you are not re-extracting the raw
#   articles: downstream analysis relies on the keys being correct.
DOI_LISTS = {
    "Ecology": "DOI_lists/subjectEcology",
    "Molecular": "DOI_lists/subjectMolecular",
    "Genomics": "DOI_lists/subjectGenomics",
    "Biochemistry": "DOI_lists/subjectBiochemistry",
}
# Match DOIs with articles and copy the matches to DATA_DIR/raw_articles?
# NOTE(review): the original comment named ANALYSIS_DIR as the place the
# articles are matched in; ARCHIVE_DIR may have been intended -- confirm.
EXTRACT_RAW_ARTICLES_BY_DOI = True

# Copy processed article bodies to DATA_DIR/pure?
PURIFY_ARTICLE_SET = True
# Tokenizer object loaded once, at import time, from the NLTK data resource
# named below (the English Punkt model).
# NOTE(review): importing this settings module therefore requires the NLTK
# resource to be installed. Newer NLTK releases changed how Punkt models are
# distributed -- confirm this resource path still loads with the installed
# NLTK version.
TOKENIZER = NLTKload("tokenizers/punkt/english.pickle")
# Stopword linefeeds to be removed by ArticleScrubber: each entry is an
# abbreviation immediately followed by a newline. Leading spaces in some
# entries are significant and must be kept.
STOPS = [
    "i.e.\n",
    "e.g.\n",
    "Refs.\n",
    " et al.\n",
    "Fig.\n",
    "cf.\n",
    "c.i.\n",
    " ca.\n",
    "spp.\n",
    " ver.\n",
    " ind.\n",
    " obs.\n",
]
### LINNAEUS OPTIONS ###

# Run the Linnaeus tagger?
TAG_SPECIES = True
TAG_INTERACTIONS = True

# Linnaeus tags are stored relative to DATA_DIR.
TAGS_DIR = "tags_fc"

# Path to the Linnaeus jar file.
LINNAEUS_JAR = "/home/david/Avidastuff/mining/linnaeus/bin/linnaeus-1.5.jar"

# How much RAM to allocate to Linnaeus. It should be high enough to
# accommodate the loading of the entity dictionaries.
# See guide-linneaus.txt for more details.
LINNAEUS_MEMORY = "4G"

# One entry per tag type; each names its Linnaeus conf file.
# Linnaeus conf files should be kept in the control directory.
LINNAEUS_CONF = (
    {"tags": "species", "conf_file": "control/species.conf"},
    {"tags": "interactions", "conf_file": "control/interactions.conf"},
)
### ANALYSIS OPTIONS ###
# All files named in this section go to ANALYSIS_DIR.

# Interaction analysis switch and its output file names.
ANALYSE_INTERACTIONS = True
MULTISPP_INTERACTIONS = "multispecies_interactions.tsv"
ALL_INTERACTIONS = "all_interactions.tsv"
MULTISPP_INTERACTIONS_TABLE = "multispecies_interactions_table.tsv"

# Linnaeus-file analysis switch and its output file names.
ANALYSE_LINNAEUS_FILES = True
SPP_PER_ARTICLE_FILE = "species_per_article.tsv"
INTERACTIONS_PER_ARTICLE_FILE = "interactions_per_article.tsv"
SPP_NUMBERS_FILE = "species_numbers.tsv"
UNAMBIG_SPP_NUMBERS_FILE = "species_numbers_unambig.tsv"
AMBIG_SPP_NUMBERS_FILE = "species_numbers_ambig.tsv"

# Summary statistics switch, subject ordering and output file name.
# The subject names here are the same strings used as keys of DOI_LISTS.
SUMMARY_STATS = True
SUMMARY_SUBJECT_ORDER = (
    "Ecology",
    "Genomics",
    "Molecular",
    "Biochemistry",
)
SUMMARY_STATS_TABLE = "SummaryStats.tsv"