diff --git a/README.md b/README.md index a08c21f..7953af7 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,12 @@ __danker__ is a compilation of Bash and Python3 scripts that enables the computa PageRank damping factor. (default: 0.85) -s START, --start START PageRank starting value. (default: 0.1) + -t DUMP_DATE, --dump-date DUMP_DATE + Dump date in the format YYYYMMDD (defaults to latest). + (default: None) + -f FOLDER, --folder FOLDER + Folder with existing dumps, needs to match project and + dump-date parameters. (default: None) -b, --bigmem PageRank big memory flag. (default: False) -l, --links Only extract links (skip PageRank). (default: False) ``` diff --git a/script/args.py b/script/args.py index dded48c..e0e6900 100755 --- a/script/args.py +++ b/script/args.py @@ -41,6 +41,9 @@ def main(): help='PageRank damping factor.') parser.add_argument('-s', '--start', type=float, default=0.1, help='PageRank starting value.') + parser.add_argument('-t', '--dump-date', type=str, help='Dump date in the format YYYYMMDD (defaults to latest).') + parser.add_argument('-f', '--folder', type=str, help='Folder with existing dumps, ' + + 'needs to match project and dump-date parameters.') parser.add_argument('-b', '--bigmem', action='store_true', help='PageRank big memory flag.') parser.add_argument('-l', '--links', action='store_true', @@ -49,14 +52,17 @@ def main(): # Preparing arguments for Bash if args.project: - if args.project != 'wiki': - print('-p', args.project, end='') + print('-p', args.project, end='') if args.iterations: print('', '-i', args.iterations, end='') if args.damping: print('', '-d', args.damping, end='') if args.start: print('', '-s', args.start, end='') + if args.dump_date: + print('', '-t', args.dump_date, end='') + if args.folder: + print('', '-f', args.folder, end='') if args.bigmem: print('', '-b', end='') if args.links: diff --git a/script/create_links.sh b/script/create_links.sh index 99bb82b..9f229eb 100755 --- a/script/create_links.sh +++ 
b/script/create_links.sh @@ -16,9 +16,48 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . - dir=$(dirname "$0") +latest_dump() { + rss="https://dumps.wikimedia.org/$wiki/latest/$wiki-latest-" + # Latest dump date + if wget -q "$rss""page.sql.gz-rss.xml" \ + "$rss""pagelinks.sql.gz-rss.xml" \ + "$rss""redirect.sql.gz-rss.xml" \ + "$rss""page_props.sql.gz-rss.xml"; then + dump_date=$(cat "$wiki"*.xml | sed -n "s#.*$download\([0-9]\+\).*#\1#p" | sort -u) + fi + + if [ "$(echo "$dump_date" | wc -l)" != '1' ] || [ "$dump_date" == '' ]; then + (>&2 printf "[Error]\tMultiple or no date for '%s' dump.\n" "$wiki") + return 1 + fi + + rm "$wiki-latest-page.sql.gz-rss.xml" \ + "$wiki-latest-pagelinks.sql.gz-rss.xml" \ + "$wiki-latest-redirect.sql.gz-rss.xml" \ + "$wiki-latest-page_props.sql.gz-rss.xml" + echo "$dump_date" +} + +download() { + tmpdir=$(mktemp -d -t "danker.XXXX") + cd "$tmpdir" || return 1 + + # Download and unzip + if ! wget -q --waitretry=1m --retry-connrefused "$download$dump_date/$page.gz" \ + "$download$dump_date/$pagelinks.gz" \ + "$download$dump_date/$redirect.gz" \ + "$download$dump_date/$pageprops.gz"; then + (>&2 printf "[Error]\tCouldn't download dumps of '%s' for date '%s'.\n" "$wiki" "$dump_date") + return 1 + fi + + gunzip "$page.gz" "$pagelinks.gz" "$redirect.gz" "$pageprops.gz" + echo "$tmpdir" +} + + if [ ! "$1" ]; then (>&2 printf "[Error]\tMissing positional wiki language parameter. \tExamples: [en, de, bar, ...]\n") @@ -39,48 +78,31 @@ if [ ! 
"$2" ]; then fi wiki="$1$project" -# Location of wikipedia dumps +# Download location of dumps for project download="http://download.wikimedia.org/$wiki/" -rss="https://dumps.wikimedia.org/$wiki/latest/$wiki-latest-" - -# Latest dump date -if wget -q "$rss""page.sql.gz-rss.xml" \ - "$rss""pagelinks.sql.gz-rss.xml" \ - "$rss""redirect.sql.gz-rss.xml" \ - "$rss""page_props.sql.gz-rss.xml"; then - dump_date=$(cat "$wiki"*.xml | sed -n "s#.*$download\([0-9]\+\).*#\1#p" | sort -u) -fi -if [ "$(echo "$dump_date" | wc -l)" != '1' ] || [ "$dump_date" == '' ]; then - (>&2 printf "[Error]\tMultiple or no date for '%s' dump.\n" "$wiki") - exit 1 +# Take latest if no date is specified +if [ ! "$3" ]; then + dump_date=$(latest_dump) || exit 1 +else + dump_date="$3" fi -rm "$wiki-latest-page.sql.gz-rss.xml" \ - "$wiki-latest-pagelinks.sql.gz-rss.xml" \ - "$wiki-latest-redirect.sql.gz-rss.xml" \ - "$wiki-latest-page_props.sql.gz-rss.xml" - -# File locations +# File names are now fully specified page="$wiki-""$dump_date""-page.sql" pagelinks="$wiki-""$dump_date""-pagelinks.sql" redirect="$wiki-""$dump_date""-redirect.sql" pageprops="$wiki-""$dump_date""-page_props.sql" -# Download and unzip - -if ! wget -q --waitretry=1m --retry-connrefused "$download$dump_date/$page.gz" \ - "$download$dump_date/$pagelinks.gz" \ - "$download$dump_date/$redirect.gz" \ - "$download$dump_date/$pageprops.gz"; then - (>&2 printf "Couldn't download dumps of '%s'.\n" "$wiki") - exit 1 +# If a folder is provided, take the files from the folder +if [ ! 
"$4" ]; then + file_dir=$(download) || exit 1 +else + file_dir="$4" fi -gunzip "$page.gz" "$pagelinks.gz" "$redirect.gz" "$pageprops.gz" - # Pre-process -"$dir"/maria2csv.py "$page" \ +"$dir"/maria2csv.py "$file_dir/$page" \ | csvformat -q "'" -b -p "\\" \ | csvcut -c page_id,page_namespace,page_title \ | csvgrep -c page_namespace -r "^0$|^14$" \ @@ -89,7 +111,7 @@ gunzip "$page.gz" "$pagelinks.gz" "$redirect.gz" "$pageprops.gz" | sed "s/\([0-9]\+\)\t\([0-9]\+\)\t\(.*\)/\1\t\2\3/" \ > "$wiki"page.lines -"$dir"/maria2csv.py "$pagelinks" \ +"$dir"/maria2csv.py "$file_dir/$pagelinks" \ | csvformat -q "'" -b -p "\\" \ | csvgrep -c pl_from_namespace -r "^0$|^14$" \ | csvgrep -c pl_namespace -r "^0$|^14$" \ @@ -99,7 +121,7 @@ gunzip "$page.gz" "$pagelinks.gz" "$redirect.gz" "$pageprops.gz" | sed "s/\([0-9]\+\)\t\([0-9]\+\)\t\(.*\)/\1\t\2\3/" \ > "$wiki"pagelinks.lines -"$dir"/maria2csv.py "$redirect" \ +"$dir"/maria2csv.py "$file_dir/$redirect" \ | csvformat -q "'" -b -p "\\" \ | csvcut -c rd_from,rd_namespace,rd_title \ | csvgrep -c rd_namespace -r "^0$|^14$" \ @@ -108,7 +130,7 @@ gunzip "$page.gz" "$pagelinks.gz" "$redirect.gz" "$pageprops.gz" | sed "s/\([0-9]\+\)\t\([0-9]\+\)\t\(.*\)/\1\t\2\3/" \ > "$wiki"redirect.lines -"$dir"/maria2csv.py "$pageprops" \ +"$dir"/maria2csv.py "$file_dir/$pageprops" \ | csvformat -q "'" -b -p "\\" \ | csvcut -c pp_page,pp_propname,pp_value \ | csvgrep -c pp_propname -r "^wikibase_item$" \ @@ -117,8 +139,10 @@ gunzip "$page.gz" "$pagelinks.gz" "$redirect.gz" "$pageprops.gz" | tail -n+2 \ > "$wiki"pageprops.lines -# Delete sql files. -rm "$page" "$pagelinks" "$redirect" "$pageprops" +# Delete files if in tmp dir +if [ "$(dirname "$file_dir")" == "/tmp" ]; then + rm -rf "$file_dir" +fi # To avoid any locale-related issues, it # is recommended to use the ā€˜Cā€™ locale [...]. 
diff --git a/script/dank.sh b/script/dank.sh index 2ee9ccd..f1f2065 100755 --- a/script/dank.sh +++ b/script/dank.sh @@ -16,7 +16,11 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . -while getopts ":p:i:d:s:bl" a; do +# defaults +project="wiki" +dump_time="" +folder="" +while getopts ":p:i:d:s:t:f:bl" a; do case "${a}" in p) project=${OPTARG} @@ -30,6 +34,12 @@ while getopts ":p:i:d:s:bl" a; do s) start_value=${OPTARG} ;; + t) + dump_time=${OPTARG} + ;; + f) + folder=${OPTARG} + ;; b) bigmem=1 ;; @@ -50,10 +60,10 @@ if [ ! "$1" ]; then fi if [ "$1" == "ALL" ]; then - filename=$(date +"%Y-%m-%d").allwiki"$project".links + filename=$(date +"%Y-%m-%d").all"$project".links if languages=$(./script/get_languages.sh "$project"); then for i in $languages; do - ./script/create_links.sh "$i" "$project" >> "$filename.files.txt" + ./script/create_links.sh "$i" "$project" "$dump_time" "$folder" >> "$filename.files.txt" done while IFS= read -r i @@ -80,7 +90,7 @@ if [ "$1" == "ALL" ]; then exit 1 fi else - filename=$(./script/create_links.sh "$1" "$project") + filename=$(./script/create_links.sh "$1" "$project" "$dump_time" "$folder") fi # "extract links only" option diff --git a/script/get_languages.sh b/script/get_languages.sh index 983c818..19aafed 100755 --- a/script/get_languages.sh +++ b/script/get_languages.sh @@ -21,7 +21,7 @@ # exit 0 declare -A WIKIS -WIKIS=( ["wikibooks"]="wb" ["wikisource"]="ws" ["wikiversity"]="wv" +WIKIS=( ["wiki"]="wp" ["wikibooks"]="wb" ["wikisource"]="ws" ["wikiversity"]="wv" ["wikinews"]="wn" ["wiktionary"]="wt" ["wikiquote"]="wq") # default is normal Wikipedia (wp)