Skip to content

Commit

Permalink
More: More robust parsing + dates
Browse files Browse the repository at this point in the history
  • Loading branch information
andersjo committed Mar 10, 2016
1 parent 39ccaa0 commit 1d71d02
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 22 deletions.
5 changes: 5 additions & 0 deletions src/main/java/MboxMessages.java
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,11 @@ private Message parseMessage(InputStream messageInstream) {
return messageBuilder.parseMessage(messageInstream);
} catch (IOException e) {
e.printStackTrace();
} catch (NullPointerException e) {
// Ignore a parsing error that happens inside of the Apache mime parser.
if (!e.getStackTrace()[0].getClassName().equals("org.apache.james.mime4j.io.MimeBoundaryInputStream")) {
e.printStackTrace();
}
}
return null;
}
Expand Down
62 changes: 57 additions & 5 deletions src/main/java/MsgProcessor.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,26 @@
import org.apache.james.mime4j.dom.address.AddressList;
import org.apache.james.mime4j.dom.address.Mailbox;
import org.apache.james.mime4j.dom.address.MailboxList;
import org.apache.james.mime4j.dom.datetime.DateTime;
import org.apache.james.mime4j.field.datetime.parser.DateTimeParser;
import org.apache.james.mime4j.field.datetime.parser.ParseException;
import org.apache.james.mime4j.field.datetime.parser.TokenMgrError;
import org.apache.james.mime4j.stream.Field;
import org.apache.james.mime4j.util.MimeUtil;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.util.*;
import java.time.OffsetDateTime;
import java.time.ZoneOffset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.function.Predicate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collector;
import java.util.stream.Collectors;
import java.util.stream.Stream;

Expand Down Expand Up @@ -49,8 +59,6 @@ public ProcessedMsg process(Message message) {
processedMsg.paragraphs = paragraphs;
extractFields(message, processedMsg);



} catch (MessageProcessingError e) {
return null;
}
Expand Down Expand Up @@ -175,10 +183,54 @@ private void extractFields(Message message, ProcessedMsg processedMessage) throw
}

processedMessage.subject = message.getSubject();
processedMessage.date = message.getDate();
processedMessage.date = parseComplexDateTime(message);
if (processedMessage.date == null) {
processedMessage.date = parseSimpleDateTime(message);
}

processedMessage.messageId = message.getMessageId();
processedMessage.newsgroups = extractNewsgroups(message);
}

/**
 * Fallback parser for Date headers that consist solely of a {@code yyyy/MM/dd} date.
 *
 * @param message the message whose "Date" header is inspected
 * @return midnight UTC of the given day, or {@code null} if the header is missing, is not
 *         exactly of the form {@code yyyy/MM/dd}, or does not denote a valid calendar date
 */
private OffsetDateTime parseSimpleDateTime(Message message) {
    // getField yields null when the message carries no Date header at all; treat that as
    // "no date" rather than letting a NullPointerException abort processing.
    if (message.getHeader().getField("Date") == null) {
        return null;
    }
    String body = message.getHeader().getField("Date").getBody();
    if (body == null) {
        return null;
    }

    // Surrounding whitespace would defeat matches(), which requires the whole input to match.
    Matcher matcher = Pattern.compile("(\\d{4})/(\\d{2})/(\\d{2})").matcher(body.trim());
    if (!matcher.matches()) {
        return null;
    }

    int year = Integer.parseInt(matcher.group(1));
    int month = Integer.parseInt(matcher.group(2));
    int day = Integer.parseInt(matcher.group(3));
    try {
        return OffsetDateTime.of(year, month, day, 0, 0, 0, 0, ZoneOffset.UTC);
    } catch (java.time.DateTimeException ignored) {
        // The digits matched the pattern but are not a real date (e.g. month 13, day 32).
        return null;
    }
}

/**
 * Parses the message's "Date" header with the Mime4j date-time parser and converts the
 * result to an {@link OffsetDateTime} that keeps the sender's UTC offset.
 *
 * @param message the message whose "Date" header is parsed
 * @return the parsed date and time, or {@code null} if the header is missing, cannot be
 *         parsed, or contains field values that do not form a valid date/offset
 */
private OffsetDateTime parseComplexDateTime(Message message) {
    // getField yields null when the message carries no Date header at all; treat that as
    // "no date" rather than letting a NullPointerException abort processing.
    if (message.getHeader().getField("Date") == null) {
        return null;
    }
    String body = message.getHeader().getField("Date").getBody();
    if (body == null) {
        return null;
    }

    try {
        DateTime jmimeDateTime = new DateTimeParser(new StringReader(body)).parseAll();
        int offsetHours = 0;
        int offsetMinutes = 0;

        // The DateTime class is not well documented. From reading its source it seems the
        // time zone is a signed number of the form hhmm (e.g. -530 for -05:30), with
        // Integer.MIN_VALUE apparently meaning "no zone given"; in that case UTC is assumed.
        int timeZone = jmimeDateTime.getTimeZone();
        if (timeZone != Integer.MIN_VALUE) {
            // Integer division and remainder both truncate toward zero, so hours and
            // minutes carry the same sign, as ZoneOffset.ofHoursMinutes requires.
            int totalMinutes = ((timeZone / 100) * 60) + timeZone % 100;
            offsetHours = totalMinutes / 60;
            offsetMinutes = totalMinutes % 60;
        }

        return OffsetDateTime.of(jmimeDateTime.getYear(), jmimeDateTime.getMonth(), jmimeDateTime.getDay(),
                jmimeDateTime.getHour(), jmimeDateTime.getMinute(), jmimeDateTime.getSecond(), 0,
                ZoneOffset.ofHoursMinutes(offsetHours, offsetMinutes));
    } catch (ParseException | java.time.DateTimeException | TokenMgrError ignored) {
        // Unparseable or out-of-range dates are expected in the input; the caller falls
        // back to a simpler format when null is returned.
        return null;
    }
}

Expand Down
13 changes: 8 additions & 5 deletions src/main/java/ProcessedMsg.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import java.time.OffsetDateTime;
import java.time.ZoneOffset;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

public class ProcessedMsg {
Expand All @@ -12,10 +13,10 @@ public class ProcessedMsg {
public String messageId;
public List<String> newsgroups = new ArrayList<>();
public List<String> paragraphs = new ArrayList<>();
public Date date;
public OffsetDateTime date;

public static final String[] columns = {"docId", "messageId", "senderName", "senderEmail", "subject", "langID",
"newsgroups", "paragraphs", "date"};
public static final String[] columns = {"doc_id", "message_id", "sender_name", "sender_email", "subject", "lang_id",
"newsgroups", "paragraphs", "utc_date", "timezone"};

/**
* empty constructor, elements are filled from outside. No setters (oooh!)
Expand Down Expand Up @@ -46,7 +47,9 @@ public List<String> rowData() {
elements.add(langCode);
elements.add(String.join(",", newsgroups));
elements.add(String.join("\u2029", paragraphs));
elements.add(date.toString());
elements.add(date.withOffsetSameInstant(ZoneOffset.UTC).toString());
elements.add(date.getOffset().toString());


return elements;
}
Expand Down
31 changes: 19 additions & 12 deletions src/main/java/UsenetImporter.java
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,9 @@ public void importDir(Path dir) throws IOException, InterruptedException {
try {
future.get();
} catch (ExecutionException e) {
e.printStackTrace();
Throwable cause = e.getCause();
if (cause != null)
cause.printStackTrace();
}

}
Expand Down Expand Up @@ -102,20 +104,25 @@ public SingleMboxImporter(Path mboxFile) {

@Override
public Boolean call() throws Exception {
MsgProcessor msgProcessor = setupMsgProcessor();
System.out.println("Importing " + mboxFile.getFileName());
MboxMessages messages = new MboxMessages(openInputStream());
for (Message message : messages) {
String messageId = Long.toString(docIdCounter.incrementAndGet());
ProcessedMsg processedMessage = msgProcessor.process(message);
processedMessage.docId = messageId;
if (processedMessage.isValid()) {
synchronized (csvOut) {
csvOut.printRecord(processedMessage.rowData());
try {
MsgProcessor msgProcessor = setupMsgProcessor();
MboxMessages messages = new MboxMessages(openInputStream());
for (Message message : messages) {
String messageId = Long.toString(docIdCounter.incrementAndGet());
ProcessedMsg processedMessage = msgProcessor.process(message);
processedMessage.docId = messageId;
if (processedMessage.isValid()) {
synchronized (csvOut) {
csvOut.printRecord(processedMessage.rowData());
}
}
}
return true;
} catch (Exception e) {
System.err.println("Encountering exception while processing " + mboxFile.getFileName().toString());
e.printStackTrace();
throw e;
}
return true;
}

private MsgProcessor setupMsgProcessor() {
Expand Down

0 comments on commit 1d71d02

Please sign in to comment.