
Merge pull request #14 from athalhammer/refactor/separate_date_download
Refactor/separate date download
#11
athalhammer committed Jul 29, 2020
2 parents 2ddb4c8 + 664de0f commit 72ef841
Showing 5 changed files with 89 additions and 43 deletions.
6 changes: 6 additions & 0 deletions README.md
@@ -49,6 +49,12 @@ __danker__ is a compilation of Bash and Python3 scripts that enables the computa
                        PageRank damping factor. (default: 0.85)
  -s START, --start START
                        PageRank starting value. (default: 0.1)
  -t DUMP_DATE, --dump-date DUMP_DATE
                        Dump date in the format YYYYMMDD (defaults to latest).
                        (default: None)
  -f FOLDER, --folder FOLDER
                        Folder with existing dumps, needs to match project and
                        dump-date parameters. (default: None)
  -b, --bigmem          PageRank big memory flag. (default: False)
  -l, --links           Only extract links (skip PageRank). (default: False)
```
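For orientation, the new options compose with the existing flags. A hypothetical invocation with illustrative values (dank.sh, updated later in this commit, reads options before the positional language argument):

```
# Rank German Wikipedia against a pinned dump date, reusing dump files
# already present in /data/dumps instead of downloading them:
./script/dank.sh -t 20200720 -f /data/dumps de
```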
10 changes: 8 additions & 2 deletions script/args.py
@@ -41,6 +41,9 @@ def main():
                        help='PageRank damping factor.')
    parser.add_argument('-s', '--start', type=float, default=0.1,
                        help='PageRank starting value.')
    parser.add_argument('-t', '--dump-date', type=str, help='Dump date in the format YYYYMMDD (defaults to latest).')
    parser.add_argument('-f', '--folder', type=str, help='Folder with existing dumps, ' +
                        'needs to match project and dump-date parameters.')
    parser.add_argument('-b', '--bigmem', action='store_true',
                        help='PageRank big memory flag.')
    parser.add_argument('-l', '--links', action='store_true',
@@ -49,14 +52,17 @@ def main():

    # Preparing arguments for Bash
    if args.project:
        if args.project != 'wiki':
            print('-p', args.project, end='')
        print('-p', args.project, end='')
    if args.iterations:
        print('', '-i', args.iterations, end='')
    if args.damping:
        print('', '-d', args.damping, end='')
    if args.start:
        print('', '-s', args.start, end='')
    if args.dump_date:
        print('', '-t', args.dump_date, end='')
    if args.folder:
        print('', '-f', args.folder, end='')
    if args.bigmem:
        print('', '-b', end='')
    if args.links:
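The block above re-emits each parsed option as the short flag the Bash layer expects; because damping and start carry non-empty defaults, their flags are forwarded on every run, while -t and -f stay silent unless set. A sketch of the round trip (option spellings assumed from the parser, values illustrative):

```
# Illustrative only: argparse options in, short Bash flags out.
# The "..." elides the always-forwarded defaults such as -i.
$ python3 script/args.py --project wikiquote --dump-date 20200720
-p wikiquote ... -d 0.85 -s 0.1 -t 20200720
```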
96 changes: 60 additions & 36 deletions script/create_links.sh
@@ -16,9 +16,48 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


dir=$(dirname "$0")

latest_dump() {
    rss="https://dumps.wikimedia.org/$wiki/latest/$wiki-latest-"
    # Latest dump date
    if wget -q "$rss""page.sql.gz-rss.xml" \
            "$rss""pagelinks.sql.gz-rss.xml" \
            "$rss""redirect.sql.gz-rss.xml" \
            "$rss""page_props.sql.gz-rss.xml"; then
        dump_date=$(cat "$wiki"*.xml | sed -n "s#.*$download\([0-9]\+\).*#\1#p" | sort -u)
    fi

    if [ "$(echo "$dump_date" | wc -l)" != '1' ] || [ "$dump_date" == '' ]; then
        (>&2 printf "[Error]\tMultiple or no date for '%s' dump.\n" "$wiki")
        return 1
    fi

    rm "$wiki-latest-page.sql.gz-rss.xml" \
        "$wiki-latest-pagelinks.sql.gz-rss.xml" \
        "$wiki-latest-redirect.sql.gz-rss.xml" \
        "$wiki-latest-page_props.sql.gz-rss.xml"
    echo "$dump_date"
}

download() {
    tmpdir=$(mktemp -d -t "danker.XXXX")
    cd "$tmpdir" || return 1

    # Download and unzip
    if ! wget -q --waitretry=1m --retry-connrefused "$download$dump_date/$page.gz" \
            "$download$dump_date/$pagelinks.gz" \
            "$download$dump_date/$redirect.gz" \
            "$download$dump_date/$pageprops.gz"; then
        (>&2 printf "Couldn't download dumps of '%s' for date '%s'.\n" "$wiki" "$dump_date")
        return 1
    fi

    gunzip "$page.gz" "$pagelinks.gz" "$redirect.gz" "$pageprops.gz"
    echo "$tmpdir"
}


if [ ! "$1" ]; then
(>&2 printf "[Error]\tMissing positional wiki language parameter.
\tExamples: [en, de, bar, ...]\n")
@@ -39,48 +78,31 @@ if [ ! "$2" ]; then
fi
wiki="$1$project"

# Location of wikipedia dumps
# Download location of dumps for project
download="http://download.wikimedia.org/$wiki/"
rss="https://dumps.wikimedia.org/$wiki/latest/$wiki-latest-"

# Latest dump date
if wget -q "$rss""page.sql.gz-rss.xml" \
        "$rss""pagelinks.sql.gz-rss.xml" \
        "$rss""redirect.sql.gz-rss.xml" \
        "$rss""page_props.sql.gz-rss.xml"; then
    dump_date=$(cat "$wiki"*.xml | sed -n "s#.*$download\([0-9]\+\).*#\1#p" | sort -u)
fi

if [ "$(echo "$dump_date" | wc -l)" != '1' ] || [ "$dump_date" == '' ]; then
(>&2 printf "[Error]\tMultiple or no date for '%s' dump.\n" "$wiki")
exit 1
# Take latest if no date is specified
if [ ! "$3" ]; then
dump_date=$(latest_dump) || exit 1
else
dump_date="$3"
fi

rm "$wiki-latest-page.sql.gz-rss.xml" \
"$wiki-latest-pagelinks.sql.gz-rss.xml" \
"$wiki-latest-redirect.sql.gz-rss.xml" \
"$wiki-latest-page_props.sql.gz-rss.xml"

# File locations
# File names are now fully specified
page="$wiki-""$dump_date""-page.sql"
pagelinks="$wiki-""$dump_date""-pagelinks.sql"
redirect="$wiki-""$dump_date""-redirect.sql"
pageprops="$wiki-""$dump_date""-page_props.sql"

# Download and unzip

if ! wget -q --waitretry=1m --retry-connrefused "$download$dump_date/$page.gz" \
        "$download$dump_date/$pagelinks.gz" \
        "$download$dump_date/$redirect.gz" \
        "$download$dump_date/$pageprops.gz"; then
    (>&2 printf "Couldn't download dumps of '%s'.\n" "$wiki")
    exit 1
# If a folder is provided, take the files from the folder
if [ ! "$4" ]; then
    file_dir=$(download) || exit 1
else
    file_dir="$4"
fi

gunzip "$page.gz" "$pagelinks.gz" "$redirect.gz" "$pageprops.gz"

# Pre-process
"$dir"/maria2csv.py "$page" \
"$dir"/maria2csv.py "$file_dir/$page" \
| csvformat -q "'" -b -p "\\" \
| csvcut -c page_id,page_namespace,page_title \
| csvgrep -c page_namespace -r "^0$|^14$" \
@@ -89,7 +111,7 @@ gunzip "$page.gz" "$pagelinks.gz" "$redirect.gz" "$pageprops.gz"
    | sed "s/\([0-9]\+\)\t\([0-9]\+\)\t\(.*\)/\1\t\2\3/" \
    > "$wiki"page.lines

"$dir"/maria2csv.py "$pagelinks" \
"$dir"/maria2csv.py "$file_dir/$pagelinks" \
| csvformat -q "'" -b -p "\\" \
| csvgrep -c pl_from_namespace -r "^0$|^14$" \
| csvgrep -c pl_namespace -r "^0$|^14$" \
@@ -99,7 +121,7 @@ gunzip "$page.gz" "$pagelinks.gz" "$redirect.gz" "$pageprops.gz"
    | sed "s/\([0-9]\+\)\t\([0-9]\+\)\t\(.*\)/\1\t\2\3/" \
    > "$wiki"pagelinks.lines

"$dir"/maria2csv.py "$redirect" \
"$dir"/maria2csv.py "$file_dir/$redirect" \
| csvformat -q "'" -b -p "\\" \
| csvcut -c rd_from,rd_namespace,rd_title \
| csvgrep -c rd_namespace -r "^0$|^14$" \
@@ -108,7 +130,7 @@ gunzip "$page.gz" "$pagelinks.gz" "$redirect.gz" "$pageprops.gz"
    | sed "s/\([0-9]\+\)\t\([0-9]\+\)\t\(.*\)/\1\t\2\3/" \
    > "$wiki"redirect.lines

"$dir"/maria2csv.py "$pageprops" \
"$dir"/maria2csv.py "$file_dir/$pageprops" \
| csvformat -q "'" -b -p "\\" \
| csvcut -c pp_page,pp_propname,pp_value \
| csvgrep -c pp_propname -r "^wikibase_item$" \
@@ -117,8 +139,10 @@ gunzip "$page.gz" "$pagelinks.gz" "$redirect.gz" "$pageprops.gz"
    | tail -n+2 \
    > "$wiki"pageprops.lines

# Delete sql files.
rm "$page" "$pagelinks" "$redirect" "$pageprops"
# Delete files if in tmp dir
if [ "$(dirname "$file_dir")" == "/tmp" ]; then
    rm -rf "$file_dir"
fi

# To avoid any locale-related issues, it
# is recommended to use the ‘C’ locale [...].
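After the refactor, create_links.sh takes the dump date and an existing dump folder as optional third and fourth positional parameters, matching how dank.sh invokes it below. Hypothetical direct calls with illustrative values:

```
# Resolve the latest dump and download into a temp dir (previous behaviour):
./script/create_links.sh en wiki

# Pin a dump date and reuse files already downloaded to a folder:
./script/create_links.sh en wiki 20200720 /data/dumps
```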
18 changes: 14 additions & 4 deletions script/dank.sh
@@ -16,7 +16,11 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

while getopts ":p:i:d:s:bl" a; do
# defaults
project="wiki"
dump_time=""
folder=""
while getopts ":p:i:d:s:t:f:bl" a; do
    case "${a}" in
        p)
            project=${OPTARG}
@@ -30,6 +34,12 @@ while getopts ":p:i:d:s:bl" a; do
        s)
            start_value=${OPTARG}
            ;;
        t)
            dump_time=${OPTARG}
            ;;
        f)
            folder=${OPTARG}
            ;;
        b)
            bigmem=1
            ;;
@@ -50,10 +60,10 @@ if [ ! "$1" ]; then
fi

if [ "$1" == "ALL" ]; then
    filename=$(date +"%Y-%m-%d").allwiki"$project".links
    filename=$(date +"%Y-%m-%d").all"$project".links
    if languages=$(./script/get_languages.sh "$project"); then
        for i in $languages; do
            ./script/create_links.sh "$i" "$project" "$dump_time" "$folder" >> "$filename.files.txt"
        done
done

while IFS= read -r i
@@ -80,7 +90,7 @@ if [ "$1" == "ALL" ]; then
        exit 1
    fi
else
    filename=$(./script/create_links.sh "$1" "$project")
    filename=$(./script/create_links.sh "$1" "$project" "$dump_time" "$folder")
fi

# "extract links only" option
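Initialising dump_time and folder to empty strings keeps the create_links.sh call sites uniform: the quoted variables always occupy positional slots 3 and 4, and an empty slot reads as "not provided". A minimal sketch of that shell pattern:

```
# Quoted-but-empty arguments still occupy a positional slot, so a
# "$1"-style emptiness test behaves the same whether -t/-f were given.
demo() {
    if [ ! "$1" ]; then
        echo "slot 1 empty: falling back to a default"
    else
        echo "slot 1 = $1"
    fi
}
demo ""        # -> slot 1 empty: falling back to a default
demo 20200720  # -> slot 1 = 20200720
```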
2 changes: 1 addition & 1 deletion script/get_languages.sh
@@ -21,7 +21,7 @@
# exit 0

declare -A WIKIS
WIKIS=( ["wikibooks"]="wb" ["wikisource"]="ws" ["wikiversity"]="wv"
WIKIS=( ["wiki"]="wp" ["wikibooks"]="wb" ["wikisource"]="ws" ["wikiversity"]="wv"
["wikinews"]="wn" ["wiktionary"]="wt" ["wikiquote"]="wq")

# default is normal Wikipedia (wp)
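The added "wiki" key lets plain Wikipedia resolve through the same table as the sister projects. For reference, a self-contained sketch of the Bash 4+ associative-array lookup used here:

```
#!/usr/bin/env bash
# Associative arrays require Bash 4 or newer.
declare -A WIKIS
WIKIS=( ["wiki"]="wp" ["wikibooks"]="wb" ["wikisource"]="ws" ["wikiversity"]="wv"
        ["wikinews"]="wn" ["wiktionary"]="wt" ["wikiquote"]="wq")

echo "${WIKIS[wiki]}"   # -> wp
echo "${!WIKIS[@]}"     # all known project keys
```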
