diff --git a/README.md b/README.md
index a08c21f..7953af7 100644
--- a/README.md
+++ b/README.md
@@ -49,6 +49,12 @@ __danker__ is a compilation of Bash and Python3 scripts that enables the computa
PageRank damping factor. (default: 0.85)
-s START, --start START
PageRank starting value. (default: 0.1)
+ -t DUMP_DATE, --dump-date DUMP_DATE
+ Dump date in the format YYYYMMDD (defaults to latest).
+ (default: None)
+ -f FOLDER, --folder FOLDER
+ Folder with existing dumps, needs to match project and
+ dump-date parameters. (default: None)
-b, --bigmem PageRank big memory flag. (default: False)
-l, --links Only extract links (skip PageRank). (default: False)
```
diff --git a/script/args.py b/script/args.py
index dded48c..e0e6900 100755
--- a/script/args.py
+++ b/script/args.py
@@ -41,6 +41,9 @@ def main():
help='PageRank damping factor.')
parser.add_argument('-s', '--start', type=float, default=0.1,
help='PageRank starting value.')
+ parser.add_argument('-t', '--dump-date', type=str, help='Dump date in the format YYYYMMDD (defaults to latest).')
+ parser.add_argument('-f', '--folder', type=str, help='Folder with existing dumps, ' +
+ 'needs to match project and dump-date parameters.')
parser.add_argument('-b', '--bigmem', action='store_true',
help='PageRank big memory flag.')
parser.add_argument('-l', '--links', action='store_true',
@@ -49,14 +52,17 @@ def main():
# Preparing arguments for Bash
if args.project:
- if args.project != 'wiki':
- print('-p', args.project, end='')
+ print('-p', args.project, end='')
if args.iterations:
print('', '-i', args.iterations, end='')
if args.damping:
print('', '-d', args.damping, end='')
if args.start:
print('', '-s', args.start, end='')
+ if args.dump_date:
+ print('', '-t', args.dump_date, end='')
+ if args.folder:
+ print('', '-f', args.folder, end='')
if args.bigmem:
print('', '-b', end='')
if args.links:
diff --git a/script/create_links.sh b/script/create_links.sh
index 99bb82b..9f229eb 100755
--- a/script/create_links.sh
+++ b/script/create_links.sh
@@ -16,9 +16,48 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
-
dir=$(dirname "$0")
+latest_dump() {
+ rss="https://dumps.wikimedia.org/$wiki/latest/$wiki-latest-"
+ # Latest dump date
+ if wget -q "$rss""page.sql.gz-rss.xml" \
+ "$rss""pagelinks.sql.gz-rss.xml" \
+ "$rss""redirect.sql.gz-rss.xml" \
+ "$rss""page_props.sql.gz-rss.xml"; then
+ dump_date=$(cat "$wiki"*.xml | sed -n "s#.*$download\([0-9]\+\).*#\1#p" | sort -u)
+ fi
+
+ if [ "$(echo "$dump_date" | wc -l)" != '1' ] || [ "$dump_date" == '' ]; then
+ (>&2 printf "[Error]\tMultiple or no date for '%s' dump.\n" "$wiki.")
+ return 1
+ fi
+
+ rm "$wiki-latest-page.sql.gz-rss.xml" \
+ "$wiki-latest-pagelinks.sql.gz-rss.xml" \
+ "$wiki-latest-redirect.sql.gz-rss.xml" \
+ "$wiki-latest-page_props.sql.gz-rss.xml"
+ echo "$dump_date"
+}
+
+download() {
+ tmpdir=$(mktemp -d -t "danker.XXXX")
+ cd "$tmpdir" || return 1
+
+ # Download and unzip
+ if ! wget -q --waitretry=1m --retry-connrefused "$download$dump_date/$page.gz" \
+ "$download$dump_date/$pagelinks.gz" \
+ "$download$dump_date/$redirect.gz" \
+ "$download$dump_date/$pageprops.gz"; then
+ (>&2 printf "Couldn't download dumps of '%s' for date '%s'.\n" "$wiki" "$dump_date")
+ return 1
+ fi
+
+ gunzip "$page.gz" "$pagelinks.gz" "$redirect.gz" "$pageprops.gz"
+ echo "$tmpdir"
+}
+
+
if [ ! "$1" ]; then
(>&2 printf "[Error]\tMissing positional wiki language parameter.
\tExamples: [en, de, bar, ...]\n")
@@ -39,48 +78,31 @@ if [ ! "$2" ]; then
fi
wiki="$1$project"
-# Location of wikipedia dumps
+# Download location of dumps for project
download="http://download.wikimedia.org/$wiki/"
-rss="https://dumps.wikimedia.org/$wiki/latest/$wiki-latest-"
-
-# Latest dump date
-if wget -q "$rss""page.sql.gz-rss.xml" \
- "$rss""pagelinks.sql.gz-rss.xml" \
- "$rss""redirect.sql.gz-rss.xml" \
- "$rss""page_props.sql.gz-rss.xml"; then
- dump_date=$(cat "$wiki"*.xml | sed -n "s#.*$download\([0-9]\+\).*#\1#p" | sort -u)
-fi
-if [ "$(echo "$dump_date" | wc -l)" != '1' ] || [ "$dump_date" == '' ]; then
- (>&2 printf "[Error]\tMultiple or no date for '%s' dump.\n" "$wiki")
- exit 1
+# Take latest if no date is specified
+if [ ! "$3" ]; then
+ dump_date=$(latest_dump) || exit 1
+else
+ dump_date="$3"
fi
-rm "$wiki-latest-page.sql.gz-rss.xml" \
- "$wiki-latest-pagelinks.sql.gz-rss.xml" \
- "$wiki-latest-redirect.sql.gz-rss.xml" \
- "$wiki-latest-page_props.sql.gz-rss.xml"
-
-# File locations
+# File names are now fully specified
page="$wiki-""$dump_date""-page.sql"
pagelinks="$wiki-""$dump_date""-pagelinks.sql"
redirect="$wiki-""$dump_date""-redirect.sql"
pageprops="$wiki-""$dump_date""-page_props.sql"
-# Download and unzip
-
-if ! wget -q --waitretry=1m --retry-connrefused "$download$dump_date/$page.gz" \
- "$download$dump_date/$pagelinks.gz" \
- "$download$dump_date/$redirect.gz" \
- "$download$dump_date/$pageprops.gz"; then
- (>&2 printf "Couldn't download dumps of '%s'.\n" "$wiki")
- exit 1
+# If a folder is provided, take the files from the folder
+if [ ! "$4" ]; then
+ file_dir=$(download) || exit 1
+else
+ file_dir="$4"
fi
-gunzip "$page.gz" "$pagelinks.gz" "$redirect.gz" "$pageprops.gz"
-
# Pre-process
-"$dir"/maria2csv.py "$page" \
+"$dir"/maria2csv.py "$file_dir/$page" \
| csvformat -q "'" -b -p "\\" \
| csvcut -c page_id,page_namespace,page_title \
| csvgrep -c page_namespace -r "^0$|^14$" \
@@ -89,7 +111,7 @@ gunzip "$page.gz" "$pagelinks.gz" "$redirect.gz" "$pageprops.gz"
| sed "s/\([0-9]\+\)\t\([0-9]\+\)\t\(.*\)/\1\t\2\3/" \
> "$wiki"page.lines
-"$dir"/maria2csv.py "$pagelinks" \
+"$dir"/maria2csv.py "$file_dir/$pagelinks" \
| csvformat -q "'" -b -p "\\" \
| csvgrep -c pl_from_namespace -r "^0$|^14$" \
| csvgrep -c pl_namespace -r "^0$|^14$" \
@@ -99,7 +121,7 @@ gunzip "$page.gz" "$pagelinks.gz" "$redirect.gz" "$pageprops.gz"
| sed "s/\([0-9]\+\)\t\([0-9]\+\)\t\(.*\)/\1\t\2\3/" \
> "$wiki"pagelinks.lines
-"$dir"/maria2csv.py "$redirect" \
+"$dir"/maria2csv.py "$file_dir/$redirect" \
| csvformat -q "'" -b -p "\\" \
| csvcut -c rd_from,rd_namespace,rd_title \
| csvgrep -c rd_namespace -r "^0$|^14$" \
@@ -108,7 +130,7 @@ gunzip "$page.gz" "$pagelinks.gz" "$redirect.gz" "$pageprops.gz"
| sed "s/\([0-9]\+\)\t\([0-9]\+\)\t\(.*\)/\1\t\2\3/" \
> "$wiki"redirect.lines
-"$dir"/maria2csv.py "$pageprops" \
+"$dir"/maria2csv.py "$file_dir/$pageprops" \
| csvformat -q "'" -b -p "\\" \
| csvcut -c pp_page,pp_propname,pp_value \
| csvgrep -c pp_propname -r "^wikibase_item$" \
@@ -117,8 +139,10 @@ gunzip "$page.gz" "$pagelinks.gz" "$redirect.gz" "$pageprops.gz"
| tail -n+2 \
> "$wiki"pageprops.lines
-# Delete sql files.
-rm "$page" "$pagelinks" "$redirect" "$pageprops"
+# Delete the dumps only if this script downloaded them itself, i.e. no
+# folder was passed as fourth argument. Checking for a "/tmp" parent is not
+# reliable: mktemp -t honours $TMPDIR, and a user-supplied folder below /tmp
+# must not be removed.
+if [ ! "$4" ]; then
+    rm -rf -- "${file_dir:?}"
+fi
# To avoid any locale-related issues, it
# is recommended to use the ‘C’ locale [...].
diff --git a/script/dank.sh b/script/dank.sh
index 2ee9ccd..f1f2065 100755
--- a/script/dank.sh
+++ b/script/dank.sh
@@ -16,7 +16,11 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
-while getopts ":p:i:d:s:bl" a; do
+# defaults
+project="wiki"
+dump_time=""
+folder=""
+while getopts ":p:i:d:s:t:f:bl" a; do
case "${a}" in
p)
project=${OPTARG}
@@ -30,6 +34,12 @@ while getopts ":p:i:d:s:bl" a; do
s)
start_value=${OPTARG}
;;
+ t)
+ dump_time=${OPTARG}
+ ;;
+ f)
+ folder=${OPTARG}
+ ;;
b)
bigmem=1
;;
@@ -50,10 +60,10 @@ if [ ! "$1" ]; then
fi
if [ "$1" == "ALL" ]; then
- filename=$(date +"%Y-%m-%d").allwiki"$project".links
+ filename=$(date +"%Y-%m-%d").all"$project".links
if languages=$(./script/get_languages.sh "$project"); then
for i in $languages; do
- ./script/create_links.sh "$i" "$project" >> "$filename.files.txt"
+ ./script/create_links.sh "$i" "$project" "$dump_time" "$folder" >> "$filename.files.txt"
done
while IFS= read -r i
@@ -80,7 +90,7 @@ if [ "$1" == "ALL" ]; then
exit 1
fi
else
- filename=$(./script/create_links.sh "$1" "$project")
+ filename=$(./script/create_links.sh "$1" "$project" "$dump_time" "$folder")
fi
# "extract links only" option
diff --git a/script/get_languages.sh b/script/get_languages.sh
index 983c818..19aafed 100755
--- a/script/get_languages.sh
+++ b/script/get_languages.sh
@@ -21,7 +21,7 @@
# exit 0
declare -A WIKIS
-WIKIS=( ["wikibooks"]="wb" ["wikisource"]="ws" ["wikiversity"]="wv"
+WIKIS=( ["wiki"]="wp" ["wikibooks"]="wb" ["wikisource"]="ws" ["wikiversity"]="wv"
["wikinews"]="wn" ["wiktionary"]="wt" ["wikiquote"]="wq")
# default is normal Wikipedia (wp)