Skip to content

Commit

Permalink
Merge pull request #13 from athalhammer/refactor/projects
Browse files (browse the repository at this point in the history)
Refactor/projects #8
  • Loading branch information
athalhammer committed Jul 29, 2020
2 parents 388fee2 + 95572b0 commit 2ddb4c8
Show file tree
Hide file tree
Showing 5 changed files with 34 additions and 22 deletions.
6 changes: 6 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,11 @@ script:
- nosetests --with-coverage --cover-package=danker
- shellcheck *.sh */*.sh
- ./danker.sh ch
- ./danker.sh sk --project wikisource
- ./danker.sh cv --project wikibooks
- ./danker.sh th --project wikinews
- ./danker.sh hi --project wikiversity
- ./danker.sh uz --project wikiquote
- ./danker.sh ast --project wiktionary
after_success:
- coveralls
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,9 @@ __danker__ is a compilation of Bash and Python3 scripts that enables the computa
optional arguments:
-h, --help show this help message and exit
-p PROJECT, --project PROJECT
Wiki project, currently supported [wiki, books,
source, versity, news]. (default: wiki)
Wiki project, currently supported are [wiki,
wikibooks, wikisource, wikiversity, wikinews,
wiktionary, wikiquote]. (default: wiki)
-i ITERATIONS, --iterations ITERATIONS
PageRank number of iterations. (default: 40)
-d DAMPING, --damping DAMPING
Expand Down
4 changes: 2 additions & 2 deletions script/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ def main():
' "ALL" for computing PageRank over all' +
' languages available in a project.')
parser.add_argument('-p', '--project', type=str, default='wiki',
help='Wiki project, currently supported [wiki, books, source, ' +
'versity, news].')
help='Wiki project, currently supported are [wiki, wikibooks, ' +
'wikisource, wikiversity, wikinews, wiktionary, wikiquote].')
parser.add_argument('-i', '--iterations', type=int, default=40,
help='PageRank number of iterations.')
parser.add_argument('-d', '--damping', type=float, default=0.85,
Expand Down
38 changes: 21 additions & 17 deletions script/create_links.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,18 +26,22 @@ if [ ! "$1" ]; then
else
invalid=$(grep "$1" <("$dir"/get_languages.sh "$2"))
if [ -z "$invalid" ]; then
(>&2 printf "[Error]\t'%s' is an invalid language parameter for 'wiki%s'.
(>&2 printf "[Error]\t'%s' is an invalid language parameter for '%s'.
\tPlease check: http://wikistats.wmflabs.org/display.php\n" "$1" "$2")
exit 1
fi
fi

wiki="$1"
project="$2"
# default to wiki project
if [ ! "$2" ]; then
project="wiki"
fi
wiki="$1$project"

# Location of Wikimedia dumps
download="http://download.wikimedia.org/""$wiki""wiki""$project""/"
rss="https://dumps.wikimedia.org/""$wiki""wiki""$project""/latest/""$rss""$wiki""wiki""$project""-latest-"
download="http://download.wikimedia.org/$wiki/"
rss="https://dumps.wikimedia.org/$wiki/latest/$wiki-latest-"

# Latest dump date
if wget -q "$rss""page.sql.gz-rss.xml" \
Expand All @@ -52,16 +56,16 @@ if [ "$(echo "$dump_date" | wc -l)" != '1' ] || [ "$dump_date" == '' ]; then
exit 1
fi

rm "$wiki""wiki""$project""-latest-page.sql.gz-rss.xml" \
"$wiki""wiki""$project""-latest-pagelinks.sql.gz-rss.xml" \
"$wiki""wiki""$project""-latest-redirect.sql.gz-rss.xml" \
"$wiki""wiki""$project""-latest-page_props.sql.gz-rss.xml"
rm "$wiki-latest-page.sql.gz-rss.xml" \
"$wiki-latest-pagelinks.sql.gz-rss.xml" \
"$wiki-latest-redirect.sql.gz-rss.xml" \
"$wiki-latest-page_props.sql.gz-rss.xml"

# File locations
page="$wiki""wiki$project-""$dump_date""-page.sql"
pagelinks="$wiki""wiki$project-""$dump_date""-pagelinks.sql"
redirect="$wiki""wiki$project-""$dump_date""-redirect.sql"
pageprops="$wiki""wiki$project-""$dump_date""-page_props.sql"
page="$wiki-""$dump_date""-page.sql"
pagelinks="$wiki-""$dump_date""-pagelinks.sql"
redirect="$wiki-""$dump_date""-redirect.sql"
pageprops="$wiki-""$dump_date""-page_props.sql"

# Download and unzip

Expand Down Expand Up @@ -201,14 +205,14 @@ join -j 2 \
"$wiki""pagelinks.lines" \
"$wiki""pageprops.lines" \
-o 2.1,1.1 -t $'\t' \
| sed "s/\(Q\|q\)\(.*\)\t\(Q\|q\)\(.*\)/\2\t\4\t""$wiki""wiki$project-$dump_date/" \
> "$wiki""wiki""$project"-"$dump_date"".links"
| sed "s/\(Q\|q\)\(.*\)\t\(Q\|q\)\(.*\)/\2\t\4\t$wiki-$dump_date/" \
> "$wiki-$dump_date"".links"

# Sort final output, cleanup, and print filename
sort -k 1,1n -k 2,2n -u \
-S 50% -T . \
-o "$wiki""wiki""$project"-"$dump_date"".links" \
"$wiki""wiki""$project"-"$dump_date"".links"
-o "$wiki"-"$dump_date"".links" \
"$wiki-$dump_date"".links"

# Delete temporary files
rm "$wiki""page.lines" \
Expand All @@ -219,4 +223,4 @@ rm "$wiki""page.lines" \
"$wiki""pagelinks_redirected.lines" \
"$wiki""pageprops.lines"

echo "$wiki""wiki""$project"-"$dump_date"".links"
echo "$wiki-$dump_date"".links"
3 changes: 2 additions & 1 deletion script/get_languages.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@
# exit 0

declare -A WIKIS
WIKIS=( ["books"]="wb" ["source"]="ws" ["versity"]="wv" ["news"]="wn" )
WIKIS=( ["wikibooks"]="wb" ["wikisource"]="ws" ["wikiversity"]="wv"
["wikinews"]="wn" ["wiktionary"]="wt" ["wikiquote"]="wq")

# default is normal Wikipedia (wp)
project="wp"
Expand Down

0 comments on commit 2ddb4c8

Please sign in to comment.