Skip to content

Commit

Permalink
many updates
Browse files Browse the repository at this point in the history
  • Loading branch information
Ben Collier committed Apr 21, 2011
1 parent 0001207 commit 4f14fc4
Show file tree
Hide file tree
Showing 22 changed files with 54 additions and 452,491 deletions.
Binary file modified bin/org/mediawiki/dumper/Dumper$OutputWrapper.class
Binary file not shown.
Binary file modified bin/org/mediawiki/dumper/Dumper.class
Binary file not shown.
Binary file modified bin/org/mediawiki/dumper/ProgressFilter.class
Binary file not shown.
Binary file modified bin/org/mediawiki/importer/FlatWriter.class
Binary file not shown.
Binary file modified build/mwdumper.jar
Binary file not shown.
46,958 changes: 0 additions & 46,958 deletions build/stubmetahistorylog3-21.log

This file was deleted.

3 changes: 0 additions & 3 deletions build/train.log

This file was deleted.

202,716 changes: 0 additions & 202,716 deletions build/train.xml

This file was deleted.

65 changes: 0 additions & 65 deletions build/trainjavaout.log

This file was deleted.

202,717 changes: 0 additions & 202,717 deletions build/trains.sql

This file was deleted.

5 changes: 0 additions & 5 deletions compilerun.sh

This file was deleted.

5 changes: 3 additions & 2 deletions src/org/mediawiki/dumper/Dumper.java
Expand Up @@ -35,7 +35,7 @@
on next one or end of sequence, write out
[so for 1.4 schema we can be friendly]
progress report: [TODO]
progress report:
if possible, a percentage through file. this might not be possible.
rates and counts definitely
Expand Down Expand Up @@ -74,6 +74,7 @@


class Dumper {

public static void main(String[] args) throws IOException, ParseException {
InputStream input = null;
OutputWrapper output = null;
Expand Down Expand Up @@ -243,7 +244,7 @@ static DumpWriter openOutputSink(OutputWrapper output, String format, String par
else if (format.equals("sphinx"))
return new SphinxWriter(output.getFileStream());
else if (format.equals("flatfile"))
return new FlatWriter(output.getFileStream());
return new FlatWriter(output.getFileStream(), param);
else if (format.equals("mysql") || format.equals("pgsql") || format.equals("sql")) {
SqlStream sqlStream = output.getSqlStream();
SqlWriter ret;
Expand Down
4 changes: 2 additions & 2 deletions src/org/mediawiki/dumper/ProgressFilter.java
Expand Up @@ -36,7 +36,7 @@
public class ProgressFilter extends PageFilter {
int pages = 0;
int revisions = 0;
int interval = 1000;
int interval = 100000000;
MessageFormat format = new MessageFormat("{0} pages ({1}/sec), {2} revs ({3}/sec)");
long start = System.currentTimeMillis();

Expand Down Expand Up @@ -70,7 +70,7 @@ public void writeEndWiki() throws IOException {
}

/**
 * Calls showProgress() when the running revision count reaches a
 * reporting boundary, as decided by the modulus check(s) below.
 */
private void reportProgress() {
// NOTE(review): this diff hunk shows two modulus checks - one against the
// `interval` field and one against a hard-coded 100000. Presumably the first
// is the pre-change line and the second its replacement; confirm which is
// intended, since the hard-coded value ignores the `interval` field.
if (revisions % interval == 0)
if (revisions % 100000 == 0)
showProgress();
}

Expand Down
2 changes: 1 addition & 1 deletion src/org/mediawiki/dumper/gui/DumperGui.java
Expand Up @@ -190,7 +190,7 @@ void startImport(String inputFile) throws IOException, SQLException {
assert schemaReady;
assert !running;

// TODO work right ;)
// work right ;)
final InputStream stream = Tools.openInputFile(inputFile);
//DumpWriter writer = new MultiWriter();
conn.setCatalog(dbname);
Expand Down
2 changes: 1 addition & 1 deletion src/org/mediawiki/dumper/gui/DumperWindow.java
Expand Up @@ -184,7 +184,7 @@ protected void onStartButtonActionPerformed(java.awt.event.ActionEvent evt) {
// TODO Auto-generated catch block
e1.printStackTrace();
} catch (SQLException e1) {
// TODO Auto-generated catch block
// Auto-generated catch block
e1.printStackTrace();
}
}
Expand Down
56 changes: 41 additions & 15 deletions src/org/mediawiki/importer/FlatWriter.java
Expand Up @@ -30,8 +30,8 @@
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedWriter;
import java.io.OutputStreamWriter;
import java.util.Calendar;

public class FlatWriter implements DumpWriter {
Expand All @@ -40,19 +40,28 @@ public class FlatWriter implements DumpWriter {
protected static final int DELETED_COMMENT = 2;
protected static final int DELETED_USER = 4;
protected static final int DELETED_RESTRICTED = 8;
protected BufferedOutputStream writer;
//protected BufferedOutputStream writer;
protected static final Integer ONE = new Integer(1);
protected static final Integer ZERO = new Integer(0);
protected BufferedWriter pageWriter = null;
protected BufferedWriter revWriter = null;
//protected BufferedOutputStream revWriter;
private Page currentPage;
private Revision lastRevision;
protected String encoding;
protected BufferedWriter writer;

public FlatWriter(OutputStream output) throws IOException {
public FlatWriter(OutputStream output, String pageOutFileLocation) throws IOException {
stream = output;
writer = new BufferedOutputStream(stream);
pageWriter = new BufferedWriter(new FileWriter("/home/bcollier/Data/WPDump/pages.txt"));
revWriter = new BufferedWriter(new FileWriter("/home/bcollier/Data/WPDump/revisions.txt"));
//writer = new BufferedOutputStream(stream);
pageWriter = new BufferedWriter(new FileWriter(pageOutFileLocation));
//revWriter = new BufferedWriter(new FileWriter("/home/bcollier/revisions_mwdumper.txt"));
//revWriter = new BufferedOutputStream(System.out, OUT_BUF_SZ);
//revWriter = writer;

encoding = "utf-8";
revWriter = new BufferedWriter(new OutputStreamWriter(stream, "UTF8"));

}

protected String timestampFormat(Calendar time) {
Expand Down Expand Up @@ -84,14 +93,14 @@ public void close() throws IOException {
}

public void writeStartWiki() throws IOException{
writer.write("Begin Writing...\n".getBytes());
//writer.write("Begin Writing...\n".getBytes());
}

public void writeEndWiki() throws IOException{
writer.write("Finished Writing.".getBytes());
writer.write("\n".getBytes());
writer.flush();
writer.close();
//writer.write("Finished Writing.".getBytes());
//writer.write("\n".getBytes());
//writer.flush();
//writer.close();
pageWriter.flush();
revWriter.flush();
pageWriter.close();
Expand All @@ -106,8 +115,8 @@ public void writeSiteinfo(Siteinfo info) throws IOException{
public void writeStartPage(Page page) throws IOException{
currentPage = page;
lastRevision = null;
writer.write("Starting a new page".getBytes());
writer.write("\n".getBytes());
//writer.write("Starting a new page".getBytes());
//writer.write("\n".getBytes());

//pageWriter.write("page\n");
}
Expand All @@ -125,30 +134,47 @@ public void writeEndPage() throws IOException{
//writer.write("\n".getBytes());
}

/**
 * Flattens a string so it can be written as one field of a tab-separated,
 * newline-terminated record.
 *
 * Each tab, line feed and carriage return is replaced by a single space;
 * no other character is altered and the length is unchanged. Carriage
 * returns are included because a stray one would still split the record
 * for readers that treat CR as a line break.
 *
 * @param dirtyString the raw text; may be null
 * @return the text with every tab, LF and CR replaced by a space, or the
 *         empty string when dirtyString is null
 */
public String clStr(String dirtyString) {
    if (dirtyString == null) {
        return "";
    }
    return dirtyString
            .replace('\n', ' ')
            .replace('\r', ' ')
            .replace('\t', ' ');
}

public void writeRevision(Revision revision) throws IOException{

int rev_deleted = 0;
if (revision.Contributor.Username==null) rev_deleted |= DELETED_USER;
if (revision.Comment==null) rev_deleted |= DELETED_COMMENT;
if (revision.Text==null) rev_deleted |= DELETED_TEXT;


revWriter.write(Integer.toString(revision.Id));
revWriter.write("\t");
revWriter.write(Integer.toString(currentPage.Id));
revWriter.write("\t");
revWriter.write(revision.Comment == null ? "" : revision.Comment);
revWriter.write(revision.Comment == null ? "" : clStr(revision.Comment));
revWriter.write("\t");
revWriter.write(Integer.toString(revision.Contributor.Username == null ? ZERO : new Integer(revision.Contributor.Id)));
revWriter.write("\t");
revWriter.write(revision.Contributor.Username == null ? "" : revision.Contributor.Username);
revWriter.write("\t");
revWriter.write(timestampFormat(revision.Timestamp));
revWriter.write("\t");
revWriter.write(Integer.toString(revision.Timestamp.get(Calendar.YEAR)));
revWriter.write("\t");
revWriter.write(Integer.toString(revision.Timestamp.get(Calendar.MONTH)+1));
revWriter.write("\t");
revWriter.write(Integer.toString(revision.Minor ? ONE : ZERO));
revWriter.write("\t");
revWriter.write(Integer.toString(rev_deleted==0 ? ZERO : new Integer(rev_deleted)));
revWriter.write("\t");
revWriter.write(revision.Text == null ? "0" : Integer.toString(lengthUtf8(revision.Text)));
revWriter.write("\t");
revWriter.write(revision.Text == null ? "" : clStr(revision.Text));
revWriter.write("\n");

lastRevision = revision;

//use this line to get the text
Expand Down
2 changes: 1 addition & 1 deletion src/org/mediawiki/importer/Revision.java
Expand Up @@ -36,7 +36,7 @@ public class Revision {
public boolean Minor;

/**
 * Reports whether this revision's text begins with the redirect marker.
 *
 * The marker is the keyword "#REDIRECT" followed by a space, matched
 * without regard to letter case (so "#Redirect " is recognised as well as
 * "#REDIRECT " and "#redirect ").
 *
 * @return true if Text starts with the redirect marker; false otherwise,
 *         including when Text is null
 */
public boolean isRedirect() {
    // Text may be null (writers treat a null Text as deleted text);
    // a revision without text cannot be a redirect.
    if (Text == null) {
        return false;
    }
    final String marker = "#REDIRECT ";
    return Text.regionMatches(true, 0, marker, 0, marker.length());
}

Expand Down
2 changes: 1 addition & 1 deletion src/org/mediawiki/importer/SqlServerStream.java
Expand Up @@ -10,7 +10,7 @@ public class SqlServerStream implements SqlStream {
private Connection connection;

public SqlServerStream(Connection conn) {
connection = conn; // TODO
connection = conn; //
}

public void writeComment(CharSequence sql) {
Expand Down
2 changes: 1 addition & 1 deletion src/org/mediawiki/importer/SqlWriter.java
Expand Up @@ -167,7 +167,7 @@ public void writeSiteinfo(Siteinfo info) throws IOException {


/**
 * Currently a pass-through: returns the given text unchanged.
 *
 * NOTE(review): the name suggests this is meant to make text safe to embed
 * in an SQL comment, but no escaping is performed here - confirm what the
 * comment writers require before relying on it.
 *
 * @param text the text to process
 * @return the same text, unmodified
 */
protected String commentSafe(String text) {
// TODO
//
return text;
}

Expand Down
2 changes: 1 addition & 1 deletion src/org/mediawiki/importer/Title.java
Expand Up @@ -53,7 +53,7 @@ public Title(String prefixedTitle, NamespaceSet namespaces) {
}

/**
 * Currently a pass-through: returns the given text unchanged.
 *
 * NOTE(review): despite the name, no character validation or
 * normalisation is performed here - callers presumably expect a cleaned
 * title; confirm the intended rules before relying on it.
 *
 * @param text the title text to check
 * @return the same text, unmodified
 */
public static String ValidateTitleChars(String text) {
// FIXME
//
return text;
}

Expand Down
2 changes: 1 addition & 1 deletion src/org/mediawiki/importer/TitleTest.java
Expand Up @@ -110,7 +110,7 @@ public void testTitleStringNamespaceSet() {
* Test method for 'org.mediawiki.importer.Title.ValidateTitleChars(String)'
*/
/*public void testValidateTitleChars() {
// FIXME
//
}*/

/*
Expand Down
2 changes: 1 addition & 1 deletion src/org/mediawiki/importer/XmlDumpWriter.java
Expand Up @@ -63,7 +63,7 @@ public void writeStartWiki() throws IOException {
{"xsi:schemaLocation", ns + " " + schema},
{"version", version},
{"xml:lang", "en"}});
// TODO: store and keep the xml:lang
// store and keep the xml:lang
}

public void writeEndWiki() throws IOException {
Expand Down

0 comments on commit 4f14fc4

Please sign in to comment.