GitHub - adamwulf/wikipedia2text: Tools to convert wikipedia to text

Branches Tags

Name		Name	Last commit message	Last commit date
Latest commit History 29 Commits
samples		samples
wiki2xml		wiki2xml
BeautifulSoup.py		BeautifulSoup.py
BeautifulSoup.pyc		BeautifulSoup.pyc
CHANGES		CHANGES
README		README
build-corpus.sh		build-corpus.sh
check-all.sh		check-all.sh
check.sh		check.sh
clean-txt.sh		clean-txt.sh
combine-plaintext.sh		combine-plaintext.sh
echo-text-from-xml.sh		echo-text-from-xml.sh
extracttop.py		extracttop.py
footer		footer
generate-corpus.sh		generate-corpus.sh
header		header
into16.sl		into16.sl
into64.sl		into64.sl
into8.sl		into8.sl
into8xml.sl		into8xml.sl
launch-all.sh		launch-all.sh
launch.sh		launch.sh
makecorpus.sl		makecorpus.sl
mkLaunch.sl		mkLaunch.sl
monitor-php-process.sh		monitor-php-process.sh
pass2.sh		pass2.sh
removeplain8.sl		removeplain8.sl
show-disk.sh		show-disk.sh
show-php-processes.sh		show-php-processes.sh
showlines.sh		showlines.sh
sleep.jar		sleep.jar
split-xml.sl		split-xml.sl
stop.command.bak		stop.command.bak
watchthem.sl		watchthem.sl
wiki2xml_all.sh		wiki2xml_all.sh
wikiextract.py		wikiextract.py
wikisoup.py		wikisoup.py
wikisoup.pyc		wikisoup.pyc
xmldump2files.py		xmldump2files.py

Repository files navigation

# Sample command to convert wiki-markup to XML
php ../mediawiki-1.28.2/maintenance/parse.php  ../../data/wikipedia-articles/af/af/Wikipedia%3AToday%27s_second_feature%2FJanuary_1%2C_2006.txt > sample


# Generate 8 scripts that will convert all wiki files to xml files
java -jar sleep.jar into8.sl ../../data/wikipedia-xml/en.files &

# Launch all 8 scripts to generate XML from wiki-markup

./launch-all.sh


# Find XML files and concat paths into single file
find ../../data/wikipedia-articles -type f -name '*.xml' > ../../data/wikipedia-xml/en.xml.files

# Split list of XML files into 8 lists of XML files, so we can generate 1 corpus per list
java -jar sleep.jar into8xml.sl ../../data/wikipedia-xml/en.xml.files &


# Sample command to generate corpus file from XML files

./build-corpus.sh en.xml.files corpus-out/

# build-corpus.sh can be stopped and restarted. To stop the running process, create a file called 'stop.command':
touch stop.command

# To allow the command to run again next time, just remove or rename that file:
rm stop.command

# build-corpus.sh also writes log and progress files next to its output. If the process is stopped, it writes state files to its corpus output directory so that it can resume later.


# Run top and filter by process name (also, press 'c' to show full process path)
top -p $(pgrep -d',' process-name)

# If I need to restart generating the plain-text corpus from the XML, I should also remove all the *.plain.txt files from wikipedia-articles.
# The scripts to do this can be generated with:
java -jar sleep.jar removeplain8.sl removeplain8.sl ../../data/wikipedia-xml/en.files
# These scripts can then be run directly from the ../../wikipedia-xml/ directory.