Permalink
Browse files

Integrated Mbox Iterator to Apache James Mime4j

 * integrated https://github.com/ieugen/mbox-iterator into mime4j
 * bumped junit version to 4.10
 * added Guava in dependency management (used only in mbox-iterator)

git-svn-id: https://svn.apache.org/repos/asf/james/mime4j/trunk@1351619 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information...
1 parent 7679f5d commit 9c3b4c5557bc8452824fa40157c2cd8d0c98fe00 Ioan Eugen Stan committed Jun 19, 2012
View
@@ -17,7 +17,8 @@
specific language governing permissions and limitations
under the License.
-->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
@@ -38,6 +39,11 @@
<artifactId>apache-mime4j-dom</artifactId>
<version>${project.version}</version>
</dependency>
+ <dependency>
+ <groupId>org.apache.james</groupId>
+ <artifactId>apache-mime4j-mbox-iterator</artifactId>
+ <version>0.8-SNAPSHOT</version>
+ </dependency>
<dependency>
<groupId>org.apache.james</groupId>
<artifactId>apache-mime4j-storage</artifactId>
@@ -0,0 +1,85 @@
+/****************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one *
+ * or more contributor license agreements. See the NOTICE file *
+ * distributed with this work for additional information *
+ * regarding copyright ownership. The ASF licenses this file *
+ * to you under the Apache License, Version 2.0 (the *
+ * "License"); you may not use this file except in compliance *
+ * with the License. You may obtain a copy of the License at *
+ * *
+ * http://www.apache.org/licenses/LICENSE-2.0 *
+ * *
+ * Unless required by applicable law or agreed to in writing, *
+ * software distributed under the License is distributed on an *
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY *
+ * KIND, either express or implied. See the License for the *
+ * specific language governing permissions and limitations *
+ * under the License. *
+ ****************************************************************/
+package org.apache.james.mime4j.samples.mbox;
+
+import com.google.common.base.Charsets;
+import org.apache.james.mime4j.MimeException;
+import org.apache.james.mime4j.dom.Message;
+import org.apache.james.mime4j.dom.MessageBuilder;
+import org.apache.james.mime4j.mboxiterator.CharBufferWrapper;
+import org.apache.james.mime4j.mboxiterator.MboxIterator;
+import org.apache.james.mime4j.message.DefaultMessageBuilder;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.charset.CharsetEncoder;
+
+/**
+ * Simple example of how to use the Apache Mime4j Mbox Iterator. It walks over every message
+ * in one mbox file and prints a short summary (subject, sender, recipients) of each one.
+ * A helper that writes a single message to its own file is included for reference.
+ */
+public class IterateOverMbox {
+
+    /**
+     * Encoder used when saving messages to disk. CharsetEncoder instances are stateful and
+     * not safe for concurrent use, which is fine for this single-threaded sample.
+     */
+    private static final CharsetEncoder ENCODER = Charsets.UTF_8.newEncoder();
+
+    /**
+     * Iterates over the mbox file given as the only command line argument and prints a
+     * summary of every message, followed by the message count and the elapsed time.
+     *
+     * @param args exactly one element: the path of the mbox file to parse
+     * @throws Exception if the file cannot be read or a message cannot be parsed
+     */
+    public static void main(String[] args) throws Exception {
+        if (args.length != 1) {
+            System.out.println("Please supply a path to an mbox file to parse");
+            // Stop here: without exactly one path, reading args[0] below would either
+            // throw ArrayIndexOutOfBoundsException or silently ignore extra arguments.
+            return;
+        }
+
+        final File mbox = new File(args[0]);
+        long start = System.currentTimeMillis();
+        int count = 0;
+
+        for (CharBufferWrapper message : MboxIterator.fromFile(mbox).charset(Charsets.UTF_8).build()) {
+            // To also write every message to its own file, enable the next line:
+            // saveMessageToFile(count, CharBuffer.wrap(message.toString()));
+            System.out.println(messageSummary(message.asInputStreamUTF8Encoded()));
+            count++;
+        }
+        System.out.println("Found " + count + " messages");
+        long end = System.currentTimeMillis();
+        System.out.println("Done in: " + (end - start) + " milis");
+    }
+
+    /**
+     * Writes one message, UTF-8 encoded, to {@code target/messages/msg-<count>}.
+     *
+     * @param count sequence number of the message, used as the file name suffix
+     * @param buf   the message content; its remaining characters are encoded and written
+     * @throws IOException if the target directory or file cannot be created or written
+     */
+    private static void saveMessageToFile(int count, CharBuffer buf) throws IOException {
+        File target = new File("target/messages/msg-" + count);
+        File directory = target.getParentFile();
+        // FileOutputStream does not create missing parent directories.
+        if (!directory.isDirectory() && !directory.mkdirs()) {
+            throw new IOException("Could not create directory " + directory);
+        }
+
+        FileOutputStream fout = new FileOutputStream(target);
+        try {
+            FileChannel fileChannel = fout.getChannel();
+            ByteBuffer encoded = ENCODER.encode(buf);
+            // A channel write may be partial, so keep going until the buffer is drained.
+            while (encoded.hasRemaining()) {
+                fileChannel.write(encoded);
+            }
+        } finally {
+            // Closing the stream also closes its channel, even if the write failed.
+            fout.close();
+        }
+    }
+
+    /**
+     * Parses one message and formats its subject, sender and recipients.
+     *
+     * @param messageBytes the raw bytes of a single message
+     * @return a multi-line, human readable summary of the message
+     * @throws IOException   if the stream cannot be read
+     * @throws MimeException if the content is not a valid MIME message
+     */
+    private static String messageSummary(InputStream messageBytes) throws IOException, MimeException {
+        MessageBuilder builder = new DefaultMessageBuilder();
+        Message message = builder.parseMessage(messageBytes);
+        return String.format("\nMessage %s \n" +
+                "Sent by:\t%s\n" +
+                "To:\t%s\n",
+                message.getSubject(),
+                message.getSender(),
+                message.getTo());
+    }
+}
View
@@ -0,0 +1,17 @@
+# Apache James Mime4j Mbox Iterator
+
+Apache James Mbox Iterator provides an iterator-like interface over mbox files. It is designed
+to allow easy parsing with Mime4j.
+
+It uses NIO memory mapped files and should provide fast processing capabilities.
+
+## Dependencies
+
+It has no direct dependencies other than the JDK and Google Guava (for convenience).
+
+## Building
+
+The project uses Maven as a build tool. To build, go to the root directory and run:
+
+$ mvn clean package
+
View
@@ -0,0 +1,53 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.james</groupId>
+ <artifactId>apache-mime4j-project</artifactId>
+ <version>0.8-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+
+ <artifactId>apache-mime4j-mbox-iterator</artifactId>
+ <packaging>jar</packaging>
+
+ <name>Apache JAMES Mime4j (Mbox Iterator)</name>
+ <description>Provides a fast iterator-like interface for Mbox files using NIO.</description>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava</artifactId>
+ </dependency>
+ </dependencies>
+
+</project>
@@ -0,0 +1,98 @@
+/****************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one *
+ * or more contributor license agreements. See the NOTICE file *
+ * distributed with this work for additional information *
+ * regarding copyright ownership. The ASF licenses this file *
+ * to you under the Apache License, Version 2.0 (the *
+ * "License"); you may not use this file except in compliance *
+ * with the License. You may obtain a copy of the License at *
+ * *
+ * http://www.apache.org/licenses/LICENSE-2.0 *
+ * *
+ * Unless required by applicable law or agreed to in writing, *
+ * software distributed under the License is distributed on an *
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY *
+ * KIND, either express or implied. See the License for the *
+ * specific language governing permissions and limitations *
+ * under the License. *
+ ****************************************************************/
+package org.apache.james.mime4j.mboxiterator;
+
+import com.google.common.base.Charsets;
+import com.google.common.base.Preconditions;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+
+/**
+ * Wraps a CharBuffer and exposes some convenience methods to easy parse with Mime4j.
+ */
+public class CharBufferWrapper {
+
+ private final CharBuffer messageBuffer;
+
+ public CharBufferWrapper(CharBuffer messageBuffer) {
+ Preconditions.checkNotNull(messageBuffer);
+ this.messageBuffer = messageBuffer;
+ }
+
+ public InputStream asInputStreamUTF8Encoded() {
+ return new ByteBufferInputStream(Charsets.UTF_8.encode(messageBuffer));
+ }
+
+ public InputStream asInputStream(Charset encoding) {
+ return new ByteBufferInputStream(encoding.encode(messageBuffer));
+ }
+
+ public char[] asCharArray() {
+ return messageBuffer.array();
+ }
+
+ @Override
+ public String toString() {
+ return messageBuffer.toString();
+ }
+
+ @Override
+ public int hashCode() {
+ return messageBuffer.hashCode();
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ return messageBuffer.equals(obj);
+ }
+
+ /**
+ * Provide an InputStream view over a ByteBuffer.
+ */
+ private static class ByteBufferInputStream extends InputStream {
+
+ private final ByteBuffer buf;
+
+ private ByteBufferInputStream(ByteBuffer buf) {
+ this.buf = buf;
+ }
+
+ @Override
+ public int read() throws IOException {
+ if (!buf.hasRemaining()) {
+ return -1;
+ }
+ return buf.get() & 0xFF;
+ }
+
+ @Override
+ public int read(byte[] bytes, int off, int len) throws IOException {
+ if (!buf.hasRemaining()) {
+ return -1;
+ }
+ buf.get(bytes, off, Math.min(len, buf.remaining()));
+ return len;
+ }
+
+ }
+}
@@ -0,0 +1,38 @@
+/****************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one *
+ * or more contributor license agreements. See the NOTICE file *
+ * distributed with this work for additional information *
+ * regarding copyright ownership. The ASF licenses this file *
+ * to you under the Apache License, Version 2.0 (the *
+ * "License"); you may not use this file except in compliance *
+ * with the License. You may obtain a copy of the License at *
+ * *
+ * http://www.apache.org/licenses/LICENSE-2.0 *
+ * *
+ * Unless required by applicable law or agreed to in writing, *
+ * software distributed under the License is distributed on an *
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY *
+ * KIND, either express or implied. See the License for the *
+ * specific language governing permissions and limitations *
+ * under the License. *
+ ****************************************************************/
+package org.apache.james.mime4j.mboxiterator;
+
+/**
+ * Regular expressions for From_ lines, the separator lines between the messages of an mbox.
+ * A single file normally uses one form throughout, but the form depends on the mail agent
+ * that wrote it, and several agents may have written to the same file using different forms.
+ */
+public interface FromLinePatterns {
+
+    /**
+     * Matches a whole line that starts with "From ", has an '@' in the sender and ends in
+     * four digits, for example: From ieugen@apache.org Fri Sep 09 14:04:52 2011
+     */
+    String DEFAULT = "^From \\S+@\\S.*\\d{4}$";
+
+    // Another form of From_ line found in mbox files has no '@' in the sender, e.g.
+    //   From MAILER-DAEMON Wed Oct 05 21:54:09 2011
+    // No constant is defined here for that form.
+
+}
Oops, something went wrong.

0 comments on commit 9c3b4c5

Please sign in to comment.